In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA #Principal Component Analysis
from sklearn.manifold import TSNE #T-Distributed Stochastic Neighbor Embedding
from sklearn.cluster import KMeans #K-Means Clustering

#plotly imports
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

In [2]:
survey_data = pd.read_csv('AllStudyData.csv')
survey_data.dropna(inplace = True)

In [3]:
survey_data.describe().round(2)

Unnamed: 0,Q01 List Maker,Q02 Initial Day Planner,Q03 Scheduler,Q04 Daily Goal Setter,Q05 Daily Planner,Q06 Next Week Accomplishments,Q07 Priority Setting Honoring,Q08 Unable to No,Q09 Own Time,Q10 Over Groomer,...,MAJOR,AGE,NSUGPA,HSGPA,ACT MATH,ACT ENG,ACT SCI,ACT READ,ACT COMPOSITE,ROWID
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,...,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.13,3.6,3.6,2.51,3.16,3.65,3.88,3.29,3.69,3.91,...,192.8,20.58,2.66,3.4,20.8,22.25,22.12,23.03,21.76,647.86
std,1.22,1.04,1.14,1.19,1.15,1.05,0.94,1.07,0.99,0.96,...,180.62,4.11,1.26,0.48,3.89,4.89,3.84,5.3,3.89,377.59
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,101.0,18.0,0.0,1.64,11.0,9.0,9.0,10.0,12.0,1.0
25%,2.0,3.0,3.0,2.0,2.0,3.0,3.0,3.0,3.0,3.0,...,102.0,18.0,2.07,3.1,17.0,19.0,20.0,19.0,19.0,306.0
50%,3.0,4.0,4.0,2.0,3.0,4.0,4.0,3.0,4.0,4.0,...,110.0,19.0,3.01,3.48,20.0,22.0,22.0,23.0,22.0,630.0
75%,4.0,4.0,4.0,3.0,4.0,4.0,5.0,4.0,4.0,5.0,...,110.0,21.0,3.6,3.81,24.0,25.0,24.0,27.0,24.0,999.5
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,820.0,46.0,4.0,4.0,33.0,36.0,36.0,36.0,34.0,1275.0


In [4]:
survey_data['COVID'].value_counts()

COVID
PRECOVID     341
COVID        308
POSTCOVID    118
Name: count, dtype: int64

In [5]:
survey_data['MAJOR CODE'] = survey_data['MAJOR CODE'].map({
    'BUAD': 0,
    'CIS': 1,
    'ACCT': 2,
    'OTHER': 3
})

survey_data['GENDER'] = survey_data['GENDER'].map({'M': 0,
                                                   'F': 1})

survey_data['ETHNICITY GROUP'] = survey_data['ETHNICITY GROUP'].map({
    'White': 0,
    'African American': 1,
    'Other': 2
})

survey_data['CLASS GROUP'] = survey_data['CLASS GROUP'].map({
    'FRSO': 0,
    'JRSR': 1,
})

survey_data['COVID'] = survey_data['COVID'].map({
    'PRECOVID': 0,
    'COVID': 1,
    'POSTCOVID': 2
})

In [6]:
x = survey_data[['MAJOR CODE', 'GENDER', 'ETHNICITY GROUP', 'CLASS GROUP', 'COVID']]
r = survey_data[['SRP', 'TA', 'LRP']]

In [7]:
scaler = StandardScaler()
x = pd.DataFrame(scaler.fit_transform(x))
x.rename(columns={0: 'MAJOR CODE', 1: 'GENDER', 2: 'ETHNICITY GROUP', 3: 'CLASS GROUP', 4: 'COVID'}, inplace = True)

In [8]:
X = pd.concat([x, r], axis=1, join='inner')

In [9]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(X)
clusters = kmeans.predict(X)
X["Cluster"] = clusters





In [10]:
X

Unnamed: 0,MAJOR CODE,GENDER,ETHNICITY GROUP,CLASS GROUP,COVID,SRP,TA,LRP,Cluster
0,0.672836,0.950382,0.572522,1.416984,-0.989379,30,22,17,1
3,-0.970684,-1.052209,1.932041,1.416984,-0.989379,17,18,13,2
5,-0.970684,0.950382,-0.786997,1.416984,-0.989379,30,27,23,1
7,-0.970684,0.950382,-0.786997,1.416984,-0.989379,27,27,17,1
9,1.494596,0.950382,-0.786997,-0.705724,0.405573,31,23,16,1
...,...,...,...,...,...,...,...,...,...
759,-0.970684,-1.052209,-0.786997,-0.705724,1.800524,22,22,16,0
760,-0.970684,-1.052209,0.572522,-0.705724,1.800524,25,24,16,0
761,-0.970684,-1.052209,1.932041,-0.705724,1.800524,14,23,17,2
762,-0.148924,-1.052209,0.572522,-0.705724,1.800524,22,20,13,0


In [11]:
plotX = pd.DataFrame(np.array(X.sample(451)))
plotX.columns = X.columns

In [12]:
#PCA with one principal component
pca_1d = PCA(n_components=1)

#PCA with two principal components
pca_2d = PCA(n_components=2)

#PCA with three principal components
pca_3d = PCA(n_components=3)
#This DataFrame holds that single principal component mentioned above
PCs_1d = pd.DataFrame(pca_1d.fit_transform(plotX.drop(["Cluster"], axis=1)))

#This DataFrame contains the two principal components that will be used
#for the 2-D visualization mentioned above
PCs_2d = pd.DataFrame(pca_2d.fit_transform(plotX.drop(["Cluster"], axis=1)))

#And this DataFrame contains three principal components that will aid us
#in visualizing our clusters in 3-D
PCs_3d = pd.DataFrame(pca_3d.fit_transform(plotX.drop(["Cluster"], axis=1)))
PCs_1d.columns = ["PC1_1d"]

#"PC1_2d" means: 'The first principal component of the components created for 2-D visualization, by PCA.'
#And "PC2_2d" means: 'The second principal component of the components created for 2-D visualization, by PCA.'
PCs_2d.columns = ["PC1_2d", "PC2_2d"]

PCs_3d.columns = ["PC1_3d", "PC2_3d", "PC3_3d"]
plotX = pd.concat([plotX,PCs_1d,PCs_2d,PCs_3d], axis=1, join='inner')
plotX["dummy"] = 0
cluster0 = plotX[plotX["Cluster"] == 0]
cluster1 = plotX[plotX["Cluster"] == 1]
cluster2 = plotX[plotX["Cluster"] == 2]
init_notebook_mode(connected=True)

In [13]:
#Instructions for building the 1-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_1d"],
                    y = cluster0["dummy"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_1d"],
                    y = cluster1["dummy"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["PC1_1d"],
                    y = cluster2["dummy"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in One Dimension Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= '',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [14]:
#Instructions for building the 2-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter(
                    x = cluster0["PC1_2d"],
                    y = cluster0["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter(
                    x = cluster1["PC1_2d"],
                    y = cluster1["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter(
                    x = cluster2["PC1_2d"],
                    y = cluster2["PC2_2d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Two Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)

In [15]:
#Instructions for building the 3-D plot

#trace1 is for 'Cluster 0'
trace1 = go.Scatter3d(
                    x = cluster0["PC1_3d"],
                    y = cluster0["PC2_3d"],
                    z = cluster0["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 0",
                    marker = dict(color = 'rgba(255, 128, 255, 0.8)'),
                    text = None)

#trace2 is for 'Cluster 1'
trace2 = go.Scatter3d(
                    x = cluster1["PC1_3d"],
                    y = cluster1["PC2_3d"],
                    z = cluster1["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 1",
                    marker = dict(color = 'rgba(255, 128, 2, 0.8)'),
                    text = None)

#trace3 is for 'Cluster 2'
trace3 = go.Scatter3d(
                    x = cluster2["PC1_3d"],
                    y = cluster2["PC2_3d"],
                    z = cluster2["PC3_3d"],
                    mode = "markers",
                    name = "Cluster 2",
                    marker = dict(color = 'rgba(0, 255, 200, 0.8)'),
                    text = None)

data = [trace1, trace2, trace3]

title = "Visualizing Clusters in Three Dimensions Using PCA"

layout = dict(title = title,
              xaxis= dict(title= 'PC1',ticklen= 5,zeroline= False),
              yaxis= dict(title= 'PC2',ticklen= 5,zeroline= False)
             )

fig = dict(data = data, layout = layout)

iplot(fig)