In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
#base dataset
data = pd.read_csv("data-final.csv")
#remove entries at IP addresses from which multiple responses were sent
data = data[data['IPC'] == 1].dropna()
#remove country-less entries
data = data[data['country'] != 'NONE']
#responses only (columns EXT1 through OPN10), cast to integers
R = data.loc[:,'EXT1':'OPN10'].astype('int')
#drop every row that has a 0 in any response column.
#A single row mask is built once and applied to both frames, so R and data
#are guaranteed to keep exactly the same rows (and no per-column re-filtering
#of the full frames is needed).
nonzero_rows = (R != 0).all(axis=1)
R = R[nonzero_rows]
data = data[nonzero_rows]
#segments: one frame per block of ten response columns
EXT = R.loc[:,'EXT1':'EXT10']
EST = R.loc[:,'EST1':'EST10']
AGR = R.loc[:,'AGR1':'AGR10']
CSN = R.loc[:,'CSN1':'CSN10']
OPN = R.loc[:,'OPN1':'OPN10']
print(R.shape)

In [None]:
### PRELIM INFORMATION GAIN CALCULATION
### Counting is done with np.bincount on a joint (attribute, target) index
### instead of per-row Python loops, so each (target, column) pair is a single
### vectorised pass over the data.

#answers are binned as value-1 into 5 slots, i.e. they are expected to be 1..5
N_LEVELS = 5


def _entropy_bits(counts):
    """Return the Shannon entropy, in bits, of a distribution given as raw counts.

    Empty bins are skipped (the usual 0*log2(0) = 0 convention); evaluating
    them directly would yield NaN and poison the whole sum. An all-zero
    count vector has entropy 0.
    """
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    if total == 0:
        return 0.0
    probs = counts[counts > 0] / total
    return float(-np.sum(probs * np.log2(probs)))


def _information_gain(target_vals, attr_vals, n_levels=N_LEVELS):
    """Return H(target) - H(target | attribute) in bits.

    target_vals, attr_vals: equal-length integer arrays with values in
        1..n_levels.
    Raises ValueError if any value falls outside 1..n_levels, because such a
    value would otherwise be counted in the wrong cell of the joint table.
    """
    target_idx = np.asarray(target_vals).astype(np.intp) - 1
    attr_idx = np.asarray(attr_vals).astype(np.intp) - 1
    for idx in (target_idx, attr_idx):
        if idx.size and (idx.min() < 0 or idx.max() >= n_levels):
            raise ValueError(f"values must be integers in 1..{n_levels}")

    #joint[a, t] = number of rows with attribute level a+1 and target level t+1
    joint = np.bincount(
        attr_idx * n_levels + target_idx, minlength=n_levels * n_levels
    ).reshape(n_levels, n_levels)

    n_rows = joint.sum()
    if n_rows == 0:
        return 0.0

    #entropy of the target on its own (column sums of the joint table)
    target_entropy = _entropy_bits(joint.sum(axis=0))

    #entropy of the target given the attribute: per-level entropies weighted
    #by how often each attribute level occurs (levels that never occur get
    #weight 0 and entropy 0, instead of dividing by zero)
    attr_totals = joint.sum(axis=1)
    conditional_entropy = sum(
        (attr_totals[level] / n_rows) * _entropy_bits(joint[level])
        for level in range(n_levels)
    )

    return target_entropy - conditional_entropy


for target in ['EXT7','EST1','AGR8','CSN3','OPN7']:#R.columns
    print(f"target = {target}")
    target_vals = R[target].to_numpy()

    print(f'information gains:')
    #compute information gain from each of the remaining attributes
    for col in R.columns:
        if col == target:
            continue
        gain = _information_gain(target_vals, R[col].to_numpy())
        print(f'{col} : {gain}')

In [None]:
#test decision tree accuracy for chosen targets
for target in ['EXT7','EST1','AGR8','CSN3','OPN7']:
    print(f"predicting {target}")
    #separate predictors and target; drop() returns a new frame, R is untouched
    X = R.drop(columns=target).to_numpy()
    #shift the class labels down by one. The subtraction allocates a fresh
    #array, so nothing is written into the array returned by to_numpy()
    #(which may be a view and is read-only under pandas Copy-on-Write), and
    #no per-row Python loop is needed.
    y = R[target].to_numpy() - 1
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.15,random_state=0)

    #fit decision trees with various depths
    #NOTE(review): the score reported per depth is computed on the test split;
    #if a depth is picked from these numbers, confirm it on separate held-out data.
    for depth in [5,6,7,8,9,10,12,14,15]:
        clf = tree.DecisionTreeClassifier(criterion='entropy',max_depth=depth,random_state=0)
        clf.fit(X_train,y_train)
        print(f"depth = {depth}: test accuracy = {clf.score(X_test,y_test)}")

In [None]:
# Results pasted from an earlier run of the decision-tree cell above (test accuracy per target and max_depth)
# predicting EXT7
# depth = 5: test accuracy = 0.4827079664957731
# depth = 6: test accuracy = 0.4864339011411283
# depth = 7: test accuracy = 0.49346745400951425
# depth = 8: test accuracy = 0.4999951358555544
# depth = 9: test accuracy = 0.502242370589437
# depth = 10: test accuracy = 0.5034875915675192
# depth = 12: test accuracy = 0.501512748922592
# depth = 14: test accuracy = 0.4890216259862053
# depth = 15: test accuracy = 0.48021752453960875
# predicting EST1
# depth = 5: test accuracy = 0.4728142966933546
# depth = 6: test accuracy = 0.4861906939188466
# depth = 7: test accuracy = 0.4904808693198953
# depth = 8: test accuracy = 0.49447919605420604
# depth = 9: test accuracy = 0.4962205597657428
# depth = 10: test accuracy = 0.49686262683256643
# depth = 12: test accuracy = 0.49153152452015214
# depth = 14: test accuracy = 0.4783983345169418
# depth = 15: test accuracy = 0.4679306956699386
# predicting AGR8
# depth = 5: test accuracy = 0.48395318747385524
# depth = 6: test accuracy = 0.49495588220987813
# depth = 7: test accuracy = 0.4999367661222068
# depth = 8: test accuracy = 0.5026509587228702
# depth = 9: test accuracy = 0.5045285184788848
# depth = 10: test accuracy = 0.5053262381679686
# depth = 12: test accuracy = 0.502203457433872
# depth = 14: test accuracy = 0.4914634264979133
# depth = 15: test accuracy = 0.48335003356259665
# predicting CSN3
# depth = 5: test accuracy = 0.4909672837644587
# depth = 6: test accuracy = 0.49398305332075143
# depth = 7: test accuracy = 0.5003745391223138
# depth = 8: test accuracy = 0.5033124823674764
# depth = 9: test accuracy = 0.5070578735906142
# depth = 10: test accuracy = 0.5096358701467999
# depth = 12: test accuracy = 0.5049954763456656
# depth = 14: test accuracy = 0.4928448435204732
# depth = 15: test accuracy = 0.4842255795628107
# predicting OPN7
# depth = 5: test accuracy = 0.5293356551516154
# depth = 6: test accuracy = 0.5380132888426254
# depth = 7: test accuracy = 0.5418462346657846
# depth = 8: test accuracy = 0.5440253713774285
# depth = 9: test accuracy = 0.5466520093780705
# depth = 10: test accuracy = 0.5477123928672186
# depth = 12: test accuracy = 0.545465158133336
# depth = 14: test accuracy = 0.5359606198865682
# depth = 15: test accuracy = 0.5289076104403997