In [44]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#Import the data and make it a dataframe
NPI_Data = pd.read_csv('data.csv')
NPI_Data = pd.DataFrame(NPI_Data)

#Turn the data into a correlation matrix
NPI_Correlation = NPI_Data.corr().abs()

# Select upper triangle of correlation matrix
upper = NPI_Correlation.where(np.triu(np.ones(NPI_Correlation.shape), k=1).astype(bool))

# Find features with correlation greater than x
x = 0.55
to_drop = [column for column in upper.columns if any(upper[column] > x)]
print(to_drop)

# Drop features 
NPI_Data.drop(to_drop, axis=1, inplace=True)
print(NPI_Data)

['Q27', 'Q30', 'Q32', 'Q35', 'Q36', 'Q40']
       score  Q1  Q2  Q3  Q4  Q5  Q6  Q7  Q8  Q9  ...  Q29  Q31  Q33  Q34  \
0         18   2   2   2   2   1   2   1   2   2  ...    2    1    1    1   
1          6   2   2   2   1   2   2   1   2   1  ...    2    2    2    2   
2         27   1   2   2   1   2   1   2   1   2  ...    2    2    1    1   
3         29   1   1   2   2   2   1   2   1   1  ...    2    1    1    1   
4          6   1   2   1   1   1   2   1   2   1  ...    2    2    2    2   
...      ...  ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...  ...  ...  ...   
11238      1   2   2   2   1   1   2   1   2   1  ...    2    2    2    2   
11239     10   2   2   1   1   1   2   1   1   1  ...    1    2    2    2   
11240      6   1   2   2   1   1   2   1   2   1  ...    1    1    2    2   
11241     12   2   2   1   1   1   1   1   1   1  ...    1    2    1    1   
11242     18   1   2   1   1   1   1   2   2   1  ...    2    1    1    2   

       Q37  Q38  Q39  elapse  ge

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier as RFC
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
import seaborn as sn

x_socio = NPI_Data.iloc[:, :]
y_socio = NPI_Data.iloc[:, 0]
x_train_soc, x_test_soc, y_train_soc, y_test_soc = train_test_split(
     x_socio, y_socio, test_size=0.45, random_state=0)
corr_matrix = x_socio.corr()
sn.heatmap(corr_matrix, annot=True, annot_kws={"size": 3})
plt.savefig('corr_matrix.png', dpi=300)
#print(NPI_Data.groupby('cluster').mean())

from sklearn import tree
clf_dt_soc = tree.DecisionTreeClassifier(max_depth = 4, criterion='entropy', random_state=42)
clf_dt_soc.fit(x_train_soc, y_train_soc)
y_dt_pred_soc = clf_dt_soc.predict(x_test_soc)
train_acc_soc = accuracy_score(y_train_soc, clf_dt_soc.predict(x_train_soc))
test_acc_soc = accuracy_score(y_test_soc, y_dt_pred_soc)

print('DT: Psych. train acc: ', round(train_acc_soc*100, 4))
print('DT: Psych. test acc: ', round(test_acc_soc*100, 4))

rf_clf_soc = RFC(random_state=0)
rf_clf_soc = rf_clf_soc.fit(x_train_soc, y_train_soc)
y_rfc_pred_soc = rf_clf_soc.predict(x_test_soc)
rfc_train_acc_soc = accuracy_score(y_train_soc, rf_clf_soc.predict(x_train_soc))
rfc_test_acc_soc = accuracy_score(y_test_soc, y_rfc_pred_soc)
print('RFC: Psych train acc: ', round(rfc_train_acc_soc, 4))
print('RFC: Psych test acc: ', round(rfc_test_acc_soc, 4))

from sklearn.ensemble import GradientBoostingClassifier as GBC
gbt_clf_soc = GBC(n_estimators=100, learning_rate=1.0, random_state=0).fit(x_train_soc, y_train_soc)
y_gbt_pred_soc = gbt_clf_soc.predict(x_test_soc)
gbt_train_acc_soc = accuracy_score(y_train_soc, gbt_clf_soc.predict(x_train_soc))
gbt_test_acc_soc = accuracy_score(y_test_soc, y_gbt_pred_soc)
print('RFC: Psych train acc: ', round(gbt_train_acc_soc, 4))
print('GBT: Psych test acc: ', round(gbt_test_acc_soc, 4))
#cluster1 = sociopath[sociopath.cluster=='cluster1']
#cluster2 = sociopath[sociopath.cluster=='cluster2']
#cluster3 = sociopath[sociopath.cluster=='cluster3']

import graphviz
dot_data = tree.export_graphviz(clf_dt_soc, feature_names=x_socio.columns, class_names=clf_dt_soc.classes_, out_file=None, filled=True)
graph = graphviz.Source(dot_data, format="png")
graph.render("dt_psych_stuff")
ftn = np.array(x_socio.columns)
text_representation = tree.export_text(clf_dt_soc)
print(text_representation)

from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x_socio)
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])
finalDf = pd.concat([principalDf, y_socio], axis = 1)
fig = plt.figure(figsize = (16,16))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
targets = ['cluster1', 'cluster2', 'cluster3', 'cluster4']
colors = ['r', 'g', 'b', 'y']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['cluster'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , alpha = 0.3
               , s = 50)
ax.legend(targets)
ax.grid()
plt.savefig('cluster_pca.png', dpi=300)
pca.components_

DT: Psych. train acc:  60.8766
DT: Psych. test acc:  60.6324
RFC: Psych train acc:  1.0
RFC: Psych test acc:  0.814
