# Setup

In [1]:
import pandas as pd
import random
import numpy as np
from scipy import linalg
import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn import mixture
from sklearn.decomposition import PCA

from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 unused import
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sns

from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans, DBSCAN
from sklearn import preprocessing
from sklearn.metrics import silhouette_score

# Dimensionality reduction
from sklearn.manifold import TSNE
%matplotlib inline

In [2]:
# Raw survey export. The file is tab-separated even though it is named .csv.
data_file = "data-final.csv"
df = pd.read_csv(data_file, delimiter="\t")


In [3]:
# Preview the first rows to confirm the tab-separated file parsed into separate columns.
df.head()

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,dateload,screenw,screenh,introelapse,testelapse,endelapse,IPC,country,lat_appx_lots_of_err,long_appx_lots_of_err
0,4.0,1.0,5.0,2.0,5.0,1.0,5.0,2.0,4.0,1.0,...,2016-03-03 02:01:01,768.0,1024.0,9.0,234.0,6,1,GB,51.5448,0.1991
1,3.0,5.0,3.0,4.0,3.0,3.0,2.0,5.0,1.0,5.0,...,2016-03-03 02:01:20,1360.0,768.0,12.0,179.0,11,1,MY,3.1698,101.706
2,2.0,3.0,4.0,4.0,3.0,2.0,1.0,3.0,2.0,5.0,...,2016-03-03 02:01:56,1366.0,768.0,3.0,186.0,7,1,GB,54.9119,-1.3833
3,2.0,2.0,2.0,3.0,4.0,2.0,2.0,4.0,1.0,4.0,...,2016-03-03 02:02:02,1920.0,1200.0,186.0,219.0,7,1,GB,51.75,-1.25
4,3.0,3.0,3.0,3.0,5.0,3.0,3.0,5.0,3.0,4.0,...,2016-03-03 02:02:57,1366.0,768.0,8.0,315.0,17,2,KE,1.0,38.0


In [4]:
# List every column name; later cells slice columns by position, so this is the
# lookup table for what those positions mean.
list(df.columns)

['EXT1',
 'EXT2',
 'EXT3',
 'EXT4',
 'EXT5',
 'EXT6',
 'EXT7',
 'EXT8',
 'EXT9',
 'EXT10',
 'EST1',
 'EST2',
 'EST3',
 'EST4',
 'EST5',
 'EST6',
 'EST7',
 'EST8',
 'EST9',
 'EST10',
 'AGR1',
 'AGR2',
 'AGR3',
 'AGR4',
 'AGR5',
 'AGR6',
 'AGR7',
 'AGR8',
 'AGR9',
 'AGR10',
 'CSN1',
 'CSN2',
 'CSN3',
 'CSN4',
 'CSN5',
 'CSN6',
 'CSN7',
 'CSN8',
 'CSN9',
 'CSN10',
 'OPN1',
 'OPN2',
 'OPN3',
 'OPN4',
 'OPN5',
 'OPN6',
 'OPN7',
 'OPN8',
 'OPN9',
 'OPN10',
 'EXT1_E',
 'EXT2_E',
 'EXT3_E',
 'EXT4_E',
 'EXT5_E',
 'EXT6_E',
 'EXT7_E',
 'EXT8_E',
 'EXT9_E',
 'EXT10_E',
 'EST1_E',
 'EST2_E',
 'EST3_E',
 'EST4_E',
 'EST5_E',
 'EST6_E',
 'EST7_E',
 'EST8_E',
 'EST9_E',
 'EST10_E',
 'AGR1_E',
 'AGR2_E',
 'AGR3_E',
 'AGR4_E',
 'AGR5_E',
 'AGR6_E',
 'AGR7_E',
 'AGR8_E',
 'AGR9_E',
 'AGR10_E',
 'CSN1_E',
 'CSN2_E',
 'CSN3_E',
 'CSN4_E',
 'CSN5_E',
 'CSN6_E',
 'CSN7_E',
 'CSN8_E',
 'CSN9_E',
 'CSN10_E',
 'OPN1_E',
 'OPN2_E',
 'OPN3_E',
 'OPN4_E',
 'OPN5_E',
 'OPN6_E',
 'OPN7_E',
 'OPN8_E',
 'OPN9_E',
 '

# EDA

In [5]:
# (rows, columns) of the raw table before any filtering.
df.shape

(1015341, 110)

In [6]:
# Keep only the rows whose IPC value equals 1, then show the remaining (rows, columns).
# NOTE(review): IPC presumably counts records sharing an IP address — confirm against
# the dataset codebook.
df = df.loc[df['IPC'].eq(1)]
df.shape

(696845, 110)

In [7]:
# Row-wise sum of the columns at positions 50-99 (the `*_E` columns in the column
# listing). NOTE(review): presumably per-question elapsed times — confirm units.
total_time = df.iloc[:, 50:100].sum(axis="columns")


In [8]:
# Summary statistics of the per-row total; the saved output shows a negative minimum
# and a maximum around 2.1e9, which motivates the range filters applied further down.
total_time.describe()

count    6.968450e+05
mean     4.886900e+05
std      7.587583e+06
min     -7.873258e+07
25%      1.663530e+05
50%      2.160290e+05
75%      2.973040e+05
max      2.147711e+09
dtype: float64

In [None]:
# fig = go.Figure(data=[go.Histogram(x=total_time)])
# fig.update_layout(
#     title_text='Histogram of Total Time', # title of plot
#     xaxis_title_text='Total Time', # xaxis label,
# )
# fig.show()

In [None]:
# fig = go.Figure(data=[go.Box(x=total_time, name="Total Time")])
# fig.update_layout(
#     title_text='Boxplot of Total Time', # title of plot
#     xaxis_title_text='Count', # xaxis label,
# )
# fig.show()

In [15]:
# Working frame: the first 50 columns (questionnaire answers) plus one extra column
# holding the row-wise sum of columns 50-99.
# .copy() makes df_q an independent frame, so adding a column below neither triggers
# SettingWithCopyWarning nor depends on whether iloc returned a view of df.
df_q = df.iloc[:, :50].copy()
df_q['total_time'] = df.iloc[:, 50:100].sum(axis=1)
df_q.shape


(696845, 51)

In [16]:
# Discard every row that has at least one missing value, then report the new shape.
df_q_clean = df_q.dropna(axis=0, how="any")
df_q_clean.shape

(695704, 51)

In [17]:
# Treat a 0 in any column (answers or total_time) as missing and drop those rows.
# Zeros are first turned into NaN so that dropna() can remove them in one pass.
df_q_clean = df_q_clean.replace(0, np.nan).dropna()
df_q_clean.shape

(601863, 51)

In [18]:
# Drop implausibly fast submissions: keep rows with total_time strictly above 10000.
# NOTE(review): the unit is presumably milliseconds — confirm.
df_q_clean = df_q_clean.loc[df_q_clean['total_time'].gt(10000)]
df_q_clean.shape

(601712, 51)

In [19]:
# Drop implausibly slow submissions: keep rows with total_time strictly below 1000000.
# NOTE(review): the unit is presumably milliseconds — confirm.
df_q_clean = df_q_clean.loc[df_q_clean['total_time'].lt(1000000)]
df_q_clean.shape

(585977, 51)

In [20]:
# Persist the cleaned frame (50 answer columns + total_time). The row index is written
# as the first CSV column because index=False is not passed.
df_q_clean.to_csv("time_cleaned.csv")

In [12]:
# Per-column summary statistics of the cleaned frame.
# NOTE(review): the saved output of this cell reports count 603322, which does not
# match the 585977 rows left by the filters above, and its execution number (12) is
# lower than the preceding cell's (20). The output is stale — re-run top to bottom.
df_q_clean.describe()

Unnamed: 0,EXT1,EXT2,EXT3,EXT4,EXT5,EXT6,EXT7,EXT8,EXT9,EXT10,...,OPN1,OPN2,OPN3,OPN4,OPN5,OPN6,OPN7,OPN8,OPN9,OPN10
count,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,...,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0,603322.0
mean,2.576876,2.848174,3.230747,3.219274,3.248504,2.424135,2.710867,3.469454,2.954209,3.621943,...,3.776963,2.021337,4.06645,1.951505,3.834374,1.878022,4.059081,3.284182,4.221016,3.997562
std,1.238025,1.306033,1.191672,1.205191,1.246861,1.21453,1.370031,1.239495,1.324913,1.263841,...,1.074955,1.083808,1.027144,1.058897,0.93027,1.072948,0.919496,1.212711,0.939487,0.982652
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,2.0,2.0,2.0,2.0,1.0,1.0,3.0,2.0,3.0,...,3.0,1.0,3.0,1.0,3.0,1.0,4.0,2.0,4.0,3.0
50%,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,3.0,4.0,...,4.0,2.0,4.0,2.0,4.0,2.0,4.0,3.0,4.0,4.0
75%,3.0,4.0,4.0,4.0,4.0,3.0,4.0,5.0,4.0,5.0,...,5.0,3.0,5.0,3.0,5.0,2.0,5.0,4.0,5.0,5.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [None]:
# Total time per row, restricted to rows of df that contain no missing value and no
# zero in ANY column (zeros are converted to NaN first so dropna() removes them).
# This rebinds `total_time`, replacing the unfiltered series computed earlier.
total_time = df.replace(0, np.nan).dropna().iloc[:,50:100].sum(axis=1)

fig = go.Figure(data=[go.Box(x=total_time, name="Total Time")])
fig.update_layout(
    title_text='Boxplot of Total Time', # title of plot
    # The x axis carries the summed times themselves, not a count.
    xaxis_title_text='Total Time', # xaxis label
)
fig.show()

In [None]:
# Display the filtered total-time series (pandas truncates the printed rows).
total_time

In [None]:
# Items the notebook treats as negatively worded; their answers are mirrored so that
# a higher number means the same direction for every item of a trait.
negative_questions = [
    'EXT2', 'EXT4', 'EXT6', 'EXT8', 'EXT10',
    'EST2', 'EST4',
    'AGR1', 'AGR3', 'AGR5', 'AGR7',
    'CSN2', 'CSN4', 'CSN6', 'CSN8',
    'OPN2', 'OPN4', 'OPN6',
]

# Mirror the 1-5 scale around its midpoint: 1<->5, 2<->4, 3 stays 3.
# The mapping is its own inverse, so running this cell twice undoes the reversal.
reversed_scale = {answer: 6 - answer for answer in range(1, 6)}
df_q_clean[negative_questions] = df_q_clean[negative_questions].replace(reversed_scale)


In [None]:
# Re-check the summary statistics now that the negatively worded items are mirrored.
df_q_clean.describe()

In [None]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go

# fig = make_subplots(rows=2,cols=3,
#                     subplot_titles=("EXT", "EST"
#                                     , "AGR", "CSN", "OPN"))

# fig.add_trace(
#     go.Histogram(x=mean_df["EXT"]), row=1, col=1
# )

# fig.add_trace(
#     go.Histogram(x=mean_df["EST"]), row=1, col=2
# )
# fig.add_trace(
#     go.Histogram(x=mean_df["AGR"]), row=1, col=3
# )
# fig.add_trace(
#     go.Histogram(x=mean_df["CSN"]), row=2, col=1
# )
# fig.add_trace(
#     go.Histogram(x=mean_df["EXT"]), row=2, col=2
# )

# fig.show()

In [None]:
# Extraversion 
EXT = list(df_q_clean.columns[:10])

# Emotional Stability
EST = list(df_q_clean.columns[10:20])

# Agreeableness
AGR = list(df_q_clean.columns[20:30])

# Conscientiousness
CSN = list(df_q_clean.columns[30:40])

# Openness
OPN = list(df_q_clean.columns[40:50])


In [None]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go

# fig = make_subplots(rows=5,cols=10,
#                     subplot_titles=EXT,
#                    shared_yaxes=True)
# colors = [
#     '#1f77b4',  # muted blue
#     '#ff7f0e',  # safety orange
#     '#2ca02c',  # cooked asparagus green
#     '#d62728',  # brick red
#     '#9467bd',  # muted purple
#     '#8c564b',  # chestnut brown
#     '#e377c2',  # raspberry yogurt pink
#     '#7f7f7f',  # middle gray
#     '#bcbd22',  # curry yellow-green
#     '#17becf'   # blue-teal
# ]
# for i in range(1,11):
#     fig.add_trace(
#         go.Histogram(x=df_q_clean["EXT{}".format(i)],
#                     marker_color=colors[0]), row=1, col=i
#     )
# for i in range(1,11):
#     fig.add_trace(
#         go.Histogram(x=df_q_clean["EST{}".format(i)],
#                     marker_color=colors[1]), row=2, col=i
#     )
    
# fig.update_layout(showlegend=False)
# fig.show()

In [None]:
# Empty frame that the next cell fills with one mean-score column per trait.
mean_df = pd.DataFrame()

In [None]:
# Per-respondent score of each trait: the sum of its ten item answers divided by 10.
trait_items = {'EXT': EXT, 'EST': EST, 'AGR': AGR, 'CSN': CSN, 'OPN': OPN}
for trait, items in trait_items.items():
    mean_df[trait] = df_q_clean[items].sum(axis=1) / 10
mean_df.head()

In [None]:
# mean_df.to_csv("means.csv", index=False)

In [None]:
# Pairwise correlation between the five trait scores (pandas default: Pearson).
mean_df.corr()

In [None]:
# Correlation matrix as a plain array rounded to two decimals.
# np.round replaces np.round_, which is deprecated and removed in NumPy 2.0.
np.round(np.array(mean_df.corr()), 2)

In [None]:
# Same rounded correlation matrix with its row order reversed (flip along axis 0).
# np.round replaces np.round_, which is deprecated and removed in NumPy 2.0.
np.flip(np.round(np.array(mean_df.corr()), 2), axis=0)

In [None]:
import plotly.figure_factory as ff

# Rounded trait correlations for the annotated heatmap. The rows are reversed and the
# y labels are reversed to match, so each cell still lines up with its (x, y) pair.
# np.round replaces np.round_, which is deprecated and removed in NumPy 2.0.
z = np.flip(np.round(np.array(mean_df.corr()), 2), axis=0)

x = list(mean_df.columns)
y = list(mean_df.columns)[::-1]

fig = ff.create_annotated_heatmap(z, x=x, y=y)
fig.update_layout(title_text="Correlations")
fig.show()

# Choosing the number of clusters

In [None]:
# Elbow scan: fit k-means for k = 2..11 and record the inertia of each fit.
# random_state pins the centroid initialisation so the curve is reproducible.
# NOTE(review): df_q_clean still contains the 'total_time' column here, whose values
# are orders of magnitude larger than the 1-5 answers — confirm that is intended.
scores = [
    KMeans(n_clusters=k, random_state=1).fit(df_q_clean).inertia_
    for k in range(2, 12)
]
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 12), y=scores)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

In [None]:
# Extend the elbow scan to k = 12..21 and plot it together with the k = 2..11 results
# already stored in `scores`.
# random_state pins the centroid initialisation so the curve is reproducible.
scores_2 = [
    KMeans(n_clusters=k, random_state=1).fit(df_q_clean).inertia_
    for k in range(12, 22)
]
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 22), y=scores + scores_2)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

In [None]:
# Extend the elbow scan to k = 22..34 and plot the full k = 2..34 curve
# (scores + scores_2 + scores_3 = 10 + 10 + 13 = 33 points).
# random_state pins the centroid initialisation so the curve is reproducible.
scores_3 = [
    KMeans(n_clusters=k, random_state=1).fit(df_q_clean).inertia_
    for k in range(22, 35)
]
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 35), y=scores + scores_2 + scores_3)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

## Inertia scan for the first 50 columns only

In [None]:
# Elbow scan for k = 2..11 on the 50 questionnaire columns only.
# iloc[:, :50] drops the trailing 'total_time' column; without it this section would
# refit exactly the same 51-column frame as the previous section.
# random_state pins the centroid initialisation so the curve is reproducible.
scores_q = [
    KMeans(n_clusters=k, random_state=1).fit(df_q_clean.iloc[:, :50]).inertia_
    for k in range(2, 12)
]
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 12), y=scores_q)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

In [None]:
# Extend the 50-column elbow scan to k = 12..21.
# iloc[:, :50] drops the trailing 'total_time' column so only the answers are clustered.
# random_state pins the centroid initialisation so the curve is reproducible.
scores_q_2 = [
    KMeans(n_clusters=k, random_state=1).fit(df_q_clean.iloc[:, :50]).inertia_
    for k in range(12, 22)
]
# Plot against scores_q (this section's k = 2..11 results), not `scores`, which
# belongs to the scan of the full frame.
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 22), y=scores_q + scores_q_2)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

In [None]:
# Extend the 50-column elbow scan to k = 22..34 and plot the full k = 2..34 curve.
# iloc[:, :50] drops the trailing 'total_time' column so only the answers are clustered.
# random_state pins the centroid initialisation so the curve is reproducible.
scores_q_3 = [
    KMeans(n_clusters=k, random_state=1).fit(df_q_clean.iloc[:, :50]).inertia_
    for k in range(22, 35)
]
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 35), y=scores_q + scores_q_2 + scores_q_3)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

In [None]:
# Show the raw inertia values for k = 2..34 from the scan of the full cleaned frame.
scores + scores_2 + scores_3

## Cosine k-means (k-means on L2-normalized rows)

In [None]:
# Cosine k-means: scale every row to unit L2 norm, then run ordinary k-means.
# `preprocessing` is already imported in the setup cell, so it is not re-imported here.
# NOTE(review): df_q_clean includes 'total_time', which is far larger than the 1-5
# answers and will dominate each row's norm — confirm that it should be included.
normalized_vectors = preprocessing.normalize(df_q_clean)

# This rebinds `scores`, replacing the Euclidean results of the previous section.
scores = []
for k in range(2, 12):
    # random_state pins the centroid initialisation so the curve is reproducible.
    res = KMeans(n_clusters=k, random_state=1).fit(normalized_vectors).inertia_
    scores.append(res)
    print(res)
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 12), y=scores)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of Cosine k-Means versus number of clusters")
# plt.savefig("intertia_cosine_kmeans.jpg", dpi=300)

In [None]:
# Extend the cosine k-means scan to k = 12..21 on the unit-norm rows and plot it
# together with the k = 2..11 results already stored in `scores`.
normalized_vectors = preprocessing.normalize(df_q_clean)
scores_2 = []
for k in range(12, 22):
    # random_state pins the centroid initialisation so the curve is reproducible.
    res = KMeans(n_clusters=k, random_state=1).fit(normalized_vectors).inertia_
    scores_2.append(res)
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 22), y=scores + scores_2)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of Cosine k-Means versus number of clusters")

In [None]:
# Extend the cosine k-means scan to k = 22..34 on the unit-norm rows and plot the
# full k = 2..34 curve.
normalized_vectors = preprocessing.normalize(df_q_clean)
scores_3 = []
for k in range(22, 35):
    # random_state pins the centroid initialisation so the curve is reproducible.
    res = KMeans(n_clusters=k, random_state=1).fit(normalized_vectors).inertia_
    scores_3.append(res)
    print(res)
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 35), y=scores + scores_2 + scores_3)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of Cosine k-Means versus number of clusters")

In [None]:
# Show the raw cosine k-means inertia values for k = 2..34.
scores + scores_2 + scores_3

# Prediction

Reference: [scikit-learn `BayesianGaussianMixture` documentation](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.BayesianGaussianMixture.html#sklearn.mixture.BayesianGaussianMixture)

In [None]:
# Reproducible 0.1% random sample of the cleaned frame as a plain 2-D array.
# df_q_clean has 51 columns at this point, so 'total_time' is one of the features.
X = df_q_clean.sample(frac=.001, random_state=1).to_numpy()
X.shape

In [None]:
# Fit a Bayesian Gaussian mixture with at most five full-covariance components.
n_components = 5
dpgmm = mixture.BayesianGaussianMixture(
    # Must be passed by keyword: the estimator's parameters are keyword-only in
    # current scikit-learn, so the positional form raises TypeError.
    n_components=n_components,
    covariance_type='full',
    max_iter=1000,
    random_state=1,  # pin the initialisation so cluster labels are reproducible
).fit(X)

In [None]:
# Component index assigned by the fitted mixture to each sampled row.
preds = dpgmm.predict(X)


## 2d PCA

In [None]:
# Project the sample onto its first two principal components and attach the mixture
# labels, so that clusters can be drawn in 2-D.
pca = PCA(n_components=2)
pca.fit(X)
pca_results = pca.transform(X)
pca_df = pd.DataFrame(
    pca_results,
    columns=['principal component {}'.format(k) for k in (1, 2)],
)
pca_df['cluster'] = preds
final_df = pca_df  # alias used by the plotting cells
final_df.head()

In [None]:
# Static 2-D scatter of the PCA projection, one colour per mixture component.
fig, ax = plt.subplots(figsize=(8, 8))
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_title('2 component PCA', fontsize=20)

targets = list(range(n_components))
# Only five colours are defined; zip() stops at the shorter sequence, so components
# beyond the fifth would not be drawn.
colors = ['r', 'g', 'b', 'm', 'c'][:n_components]
for target, color in zip(targets, colors):
    cluster_df = final_df.loc[final_df['cluster'] == target]
    ax.scatter(
        cluster_df['principal component 1'],
        cluster_df['principal component 2'],
        c=color,
        s=10,
    )
ax.legend(targets)
ax.grid()

In [None]:
# Interactive 2-D scatter of the PCA projection: one trace per mixture component.
fig = go.Figure()
targets = list(range(n_components))
for target in targets:
    cluster_df = final_df.loc[final_df['cluster'] == target]
    fig.add_trace(
        go.Scatter(
            x=cluster_df['principal component 1'],
            y=cluster_df['principal component 2'],
            mode='markers',
            marker={'size': 3},
            name=target,
        )
    )

fig.update_layout(
    title='{} clusters'.format(n_components),
    xaxis={'title': 'Principal Component 1'},
    yaxis={'title': 'Principal Component 2'},
)
fig.show()


## 3d PCA

In [None]:
# Project the sample onto its first three principal components and attach the mixture
# labels, so that clusters can be drawn in 3-D.
pca = PCA(n_components=3)
pca.fit(X)
pca_results = pca.transform(X)
pca_df = pd.DataFrame(
    pca_results,
    columns=['principal component {}'.format(k) for k in (1, 2, 3)],
)
pca_df['cluster'] = preds
final_df = pca_df  # alias used by the plotting cells
final_df.head()

In [None]:
# Static 3-D scatter of the PCA projection, one colour per mixture component.
fig, ax = plt.subplots(figsize=(8, 8), subplot_kw={'projection': '3d'})
ax.set_xlabel('Principal Component 1', fontsize=15)
ax.set_ylabel('Principal Component 2', fontsize=15)
ax.set_zlabel('Principal Component 3', fontsize=15)
ax.set_title('3 component PCA', fontsize=20)

targets = list(range(n_components))
# Only five colours are defined; zip() stops at the shorter sequence, so components
# beyond the fifth would not be drawn.
colors = ['r', 'g', 'b', 'm', 'c'][:n_components]
for target, color in zip(targets, colors):
    cluster_df = final_df.loc[final_df['cluster'] == target]
    ax.scatter(
        cluster_df['principal component 1'],
        cluster_df['principal component 2'],
        cluster_df['principal component 3'],
        c=color,
        s=10,
    )
ax.legend(targets)
ax.grid()

In [None]:
# Interactive 3-D scatter of the PCA projection: one trace per mixture component.
fig = go.Figure()
targets = list(range(n_components))
for target in targets:
    cluster_df = final_df.loc[final_df['cluster'] == target]
    fig.add_trace(
        go.Scatter3d(
            x=cluster_df['principal component 1'],
            y=cluster_df['principal component 2'],
            z=cluster_df['principal component 3'],
            mode='markers',
            marker={'size': 3},
            name=target,
        )
    )
fig.show()


In [None]:
# Histogram of the assigned component labels, i.e. how many sampled rows fall in each.
fig = go.Figure()
fig.add_trace(go.Histogram(x=preds))
fig.show()

# 100 columns (discrete and continuous)

In [None]:
# Restart from the first 100 columns of df (answers plus the `*_E` columns) and drop
# rows with any missing value. This rebinds df_q and df_q_clean, so the 51-column
# frames built earlier are no longer reachable under these names.
df_q = df.iloc[:, 0:100]
df_q_clean = df_q.dropna(axis=0, how="any")
df_q_clean.head()

In [None]:
# Scale every row to unit length (sklearn's normalize defaults to the L2 norm, row-wise).
# NOTE(review): the `*_E` columns are presumably far larger than the 1-5 answers and
# would then dominate each row's norm — confirm this is the intended scaling.
normalized_vectors = preprocessing.normalize(df_q_clean)

In [None]:
# Wrap the normalized array in a DataFrame that reuses the original column names.
# The row index starts again from 0 because only the column labels are carried over.
df_normalized = pd.DataFrame(data=normalized_vectors, columns=list(df_q_clean.columns))
df_normalized.head()

In [None]:
# Column-wise mean of the normalized values, to see which columns carry most of the
# magnitude after row normalization.
print(df_normalized.mean(axis=0))

## Selecting the number of clusters

In [None]:
# Hard-coded k-means inertia values: 33 entries, one per k = 2..34, matching the
# x range used by the plot in the next cell.
# NOTE(review): presumably pasted from an earlier run on the 100-column frame to
# avoid refitting — confirm provenance; nothing in this notebook regenerates them.
scores_k_means = [8.767975620642456e+19,
 7.825069704686853e+19,
 7.183911491842628e+19,
 6.922759430613714e+19,
 6.261210414145565e+19,
 5.800045813008041e+19,
 5.338880344802574e+19,
 4.877714293764805e+19,
 4.416552083506545e+19,
 3.955387343263997e+19,
 3.4942214703228273e+19,
 3.0330574130183025e+19,
 2.816580248817491e+19,
 2.6295770300398756e+19,
 2.494256613927725e+19,
 2.361025418523215e+19,
 2.180701321705099e+19,
 2.077457026632309e+19,
 1.9695557385714962e+19,
 1.845686524242552e+19,
 1.724399434796064e+19,
 1.6854644997394072e+19,
 1.5863294323545614e+19,
 1.518100893012325e+19,
 1.4347023208984635e+19,
 1.4129831721141086e+19,
 1.3291282299021515e+19,
 1.2907356465744026e+19,
 1.2575713154155952e+19,
 1.2163138938203034e+19,
 1.187590741095033e+19,
 1.1749391212295293e+19,
 1.1151383993056731e+19]

In [None]:
# Elbow curve from the stored k-means inertia values (k = 2..34).
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 35), y=scores_k_means)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters")

In [None]:
# Hard-coded cosine k-means inertia values: 33 entries, one per k = 2..34, matching
# the x range used by the plot in the next cell.
# NOTE(review): presumably pasted from an earlier run on the normalized 100-column
# frame — confirm provenance; nothing in this notebook regenerates them.
scores_cosine_means = [389727.99974420003,
 365203.50690628693,
 351080.3878482404,
 343842.0464398158,
 337621.6281215583,
 333099.26366572117,
 326199.1036186565,
 320031.27322683623,
 316564.4775094501,
 313715.15231793205,
 309463.3811214922,
 305735.6642873377,
 303607.8437839749,
 300634.7370109853,
 297542.8324055051,
 295206.1498766642,
 291620.6871182187,
 289190.5074452084,
 286329.7730713367,
 284118.74552086915,
 279963.4092019849,
 278896.5664801139,
 276641.8471794246,
 272812.20796448726,
 271472.0012652921,
 269389.26384386065,
 265851.11146884674,
 264277.6865159828,
 261412.70041942957,
 258657.2711889111,
 255990.40714312665,
 254161.21949834444,
 250635.1884468744]

In [None]:
# Elbow curve from the stored cosine k-means inertia values (k = 2..34).
# seaborn >= 0.12 accepts x and y as keyword arguments only.
sns.lineplot(x=np.arange(2, 35), y=scores_cosine_means)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of Cosine k-Means versus number of clusters")

## clustering

In [None]:
# X = np.array(df_q_clean.sample(frac=.001, random_state = 1))
# Reproducible 0.1% random sample of the row-normalized 100-column frame as a 2-D array.
X = df_normalized.sample(frac=.001, random_state=1).to_numpy()
X.shape

In [None]:
# Fit a Bayesian Gaussian mixture with at most five full-covariance components on the
# normalized sample and label every sampled row with its component index.
n_components = 5
dpgmm = mixture.BayesianGaussianMixture(
    # Must be passed by keyword: the estimator's parameters are keyword-only in
    # current scikit-learn, so the positional form raises TypeError.
    n_components=n_components,
    covariance_type='full',
    max_iter=1000,
    random_state=1,  # pin the initialisation so cluster labels are reproducible
).fit(X)
preds = dpgmm.predict(X)


In [None]:
# Display the component label assigned to each sampled row.
preds

### 2d PCA

In [None]:
# Two-component PCA of the normalized sample, with the mixture label of each row.
pca = PCA(n_components=2)
pca_results = pca.fit(X).transform(X)
pc_columns = ['principal component 1', 'principal component 2']
pca_df = pd.DataFrame(pca_results, columns=pc_columns)
pca_df['cluster'] = preds
final_df = pca_df  # alias used by the plotting cells
final_df.head()

In [None]:
# Interactive 2-D scatter of the normalized sample's PCA projection, one trace per
# mixture component.
fig = go.Figure()
targets = list(range(n_components))
for target in targets:
    cluster_df = final_df[final_df['cluster'].eq(target)]
    points = go.Scatter(
        x=cluster_df['principal component 1'],
        y=cluster_df['principal component 2'],
        mode='markers',
        marker=dict(size=3),
        name=target,
    )
    fig.add_trace(points)

fig.update_layout(
    title='{} clusters'.format(n_components),
    xaxis=dict(title='Principal Component 1'),
    yaxis=dict(title='Principal Component 2'),
)
fig.show()


### 3d PCA

In [None]:
# Three-component PCA of the normalized sample, with the mixture label of each row.
pca = PCA(n_components=3)
pca_results = pca.fit(X).transform(X)
pc_columns = ['principal component 1', 'principal component 2', 'principal component 3']
pca_df = pd.DataFrame(pca_results, columns=pc_columns)
pca_df['cluster'] = preds
final_df = pca_df  # alias used by the plotting cells
final_df.head()

In [None]:
# Interactive 3-D scatter of the normalized sample's PCA projection, one trace per
# mixture component.
fig = go.Figure()
targets = list(range(n_components))
for target in targets:
    cluster_df = final_df[final_df['cluster'].eq(target)]
    points = go.Scatter3d(
        x=cluster_df['principal component 1'],
        y=cluster_df['principal component 2'],
        z=cluster_df['principal component 3'],
        mode='markers',
        marker=dict(size=3),
        name=target,
    )
    fig.add_trace(points)
fig.show()


In [None]:
# Histogram of the component labels assigned to the normalized sample.
label_histogram = go.Histogram(x=preds)
fig = go.Figure(data=[label_histogram])
fig.show()