In [74]:
import pandas as pd
import shap
from scipy.sparse import coo_matrix, hstack
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [82]:
# Load the combined feature table; the first CSV column becomes the row index.
df = pd.read_csv(
    "combined_with_split_tags.csv",
    index_col=0,
)

In [86]:
# Drop the 'count' column before the feature/target split.
# errors='ignore' keeps this cell re-runnable: `df` is overwritten in place, so on
# a second execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['count'], errors='ignore')

In [87]:
# Display the column labels that remain after 'count' has been dropped.
df.columns

Index(['Cleaned_Text', 'Length', 'Emoticons_count', 'Emoticons Avg',
       'Unique_Words', 'TTR', 'anger', 'anticipation', 'disgust', 'fear',
       'joy', 'negative', 'positive', 'sadness', 'surprise', 'trust',
       'Afinn Score', 'Polarity', 'Subjectivity', 'num_noun', 'num_adj',
       'num_prep', 'num_det', 'num_pron', 'num_verb', 'num_adverb',
       'num_interject', 'lowercase', 'uppercase', 'uppercase_num',
       'proper cap', 'contractions_num', 'emotionalpunctuations_num',
       'readable_num', 'hedge_perc', 'firstperson_perc', 'thirdperson_perc',
       'Kincaid', 'ARI', 'Coleman-Liau', 'FleschReadingEase',
       'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex',
       'characters_per_word', 'syll_per_word', 'words_per_sentence',
       'sentences_per_paragraph', 'type_token_ratio', 'characters',
       'syllables', 'words', 'wordtypes', 'sentences', 'paragraphs',
       'long_words', 'complex_words', 'complex_words_dc', 'tobeverb',
       'auxverb', 'conj

In [88]:
# Positional split: all but the last four columns become the feature frame,
# the last four columns become the target frame.
# NOTE(review): this depends on the column order of the CSV. Later cells read the
# targets 'E_I', 'N_S', 'F_T' and 'J_P' by name - confirm those are the final four columns.
X, y_targets = df.iloc[:, :-4], df.iloc[:, -4:]

In [89]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible.
X_train, X_test, y_train_targets, y_test_targets = train_test_split(
    X,
    y_targets,
    test_size=0.2,
    random_state=1,
)

In [90]:
# Build a document-term count matrix from the training text only.
X_train_text = X_train['Cleaned_Text']

# Vocabulary is capped at 5000 terms and learned from the training rows.
vect = CountVectorizer(max_features=5000)
X_train_dtm = vect.fit_transform(X_train_text)

# (rows, vocabulary size)
X_train_dtm.shape

(10260, 5000)

In [91]:
# Every column after the first one (column 0 is 'Cleaned_Text', which is
# vectorised separately) converted to a sparse COO matrix.
X_train_features = coo_matrix(
    X_train.iloc[:, 1:]
)
X_train_features.shape

(10260, 71)

In [92]:
# Column-wise concatenation: term counts first, then the remaining feature columns.
X_train_dtm_features = hstack([X_train_dtm, X_train_features])
X_train_dtm_features.shape

(10260, 5071)

In [93]:
# transform (not fit_transform): the test text is encoded with the vocabulary
# already fitted on the training text.
X_test_dtm = vect.transform(
    X_test['Cleaned_Text']
)
X_test_dtm.shape

(2566, 5000)

In [94]:
# Same column selection as for the training rows: everything after column 0
# ('Cleaned_Text'), converted to a sparse COO matrix.
X_test_features = coo_matrix(
    X_test.iloc[:, 1:]
)
X_test_features.shape

(2566, 71)

In [95]:
# Column-wise concatenation in the same order as the training matrix:
# term counts first, then the remaining feature columns.
X_test_dtm_features = hstack([X_test_dtm, X_test_features])
X_test_dtm_features.shape

(2566, 5071)

# E_I

In [99]:
# Target for this section: the 'E_I' column of the train/test label frames.
y_train = y_train_targets['E_I']
y_test = y_test_targets['E_I']

# Scale, then classify. Centering is disabled (with_mean=False) because the
# stacked feature matrix is a scipy sparse matrix.
svm_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(with_mean=False)),
    ('clf', svm.SVC()),
])

# Search grid for the SVC step: 4 values of C x 3 values of gamma = 12 candidates.
Cs = [0.01, 0.1, 1, 10]
gammas = [0.01, 0.1, 1]
parameters = {'clf__gamma': gammas, 'clf__C': Cs}

#model
# NOTE(review): GridSearchCV clones the estimator it is given, so this standalone
# fit should not influence the search below; it only leaves svm_clf_1 as a fitted
# default-parameter pipeline. Consider dropping it if no later cell uses svm_clf_1.
svm_clf.fit(X_train_dtm_features, y_train)
svm_clf_1 = svm_clf
gs_svm_clf_1 = GridSearchCV(
    estimator=svm_clf_1,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train_dtm_features, y_train)

# Predict on the held-out rows with the search's refit best estimator.
y_pred_1 = gs_svm_clf_1.predict(X_test_dtm_features)
acc_1 = accuracy_score(y_test, y_pred_1)

print(f'E_I\nBest score: {gs_svm_clf_1.best_score_}\nBest params:{gs_svm_clf_1.best_params_}\nAccuracy: {acc_1}')

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 64.6min finished


E_I
Best score: 0.7140350877192981
Best params:{'clf__C': 10, 'clf__gamma': 0.01}
Accuracy: 0.720966484801247


# N_S

In [100]:
# Target for this section: the 'N_S' column of the train/test label frames.
y_train = y_train_targets['N_S']
y_test = y_test_targets['N_S']

# Scale, then classify. Centering is disabled (with_mean=False) because the
# stacked feature matrix is a scipy sparse matrix.
svm_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(with_mean=False)),
    ('clf', svm.SVC()),
])

# Search grid for the SVC step: 4 values of C x 3 values of gamma = 12 candidates.
Cs = [0.01, 0.1, 1, 10]
gammas = [0.01, 0.1, 1]
parameters = {'clf__gamma': gammas, 'clf__C': Cs}

#model
# NOTE(review): GridSearchCV clones the estimator it is given, so this standalone
# fit should not influence the search below; it only leaves svm_clf_2 as a fitted
# default-parameter pipeline. Consider dropping it if no later cell uses svm_clf_2.
svm_clf.fit(X_train_dtm_features, y_train)
svm_clf_2 = svm_clf
gs_svm_clf_2 = GridSearchCV(
    estimator=svm_clf_2,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train_dtm_features, y_train)

# Predict on the held-out rows with the search's refit best estimator.
y_pred_2 = gs_svm_clf_2.predict(X_test_dtm_features)
acc_2 = accuracy_score(y_test, y_pred_2)

print(f'N_S\nBest score: {gs_svm_clf_2.best_score_}\nBest params:{gs_svm_clf_2.best_params_}\nAccuracy: {acc_2}')

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 48.5min finished


N_S
Best score: 0.8205653021442495
Best params:{'clf__C': 0.01, 'clf__gamma': 0.01}
Accuracy: 0.8164458300857366


# F_T

In [102]:
# Target for this section: the 'F_T' column of the train/test label frames.
y_train = y_train_targets['F_T']
y_test = y_test_targets['F_T']

# Scale, then classify. Centering is disabled (with_mean=False) because the
# stacked feature matrix is a scipy sparse matrix.
svm_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(with_mean=False)),
    ('clf', svm.SVC()),
])

# Search grid for the SVC step: 4 values of C x 3 values of gamma = 12 candidates.
Cs = [0.01, 0.1, 1, 10]
gammas = [0.01, 0.1, 1]
parameters = {'clf__gamma': gammas, 'clf__C': Cs}

#model
# NOTE(review): GridSearchCV clones the estimator it is given, so this standalone
# fit should not influence the search below; it only leaves svm_clf_3 as a fitted
# default-parameter pipeline. Consider dropping it if no later cell uses svm_clf_3.
svm_clf.fit(X_train_dtm_features, y_train)
svm_clf_3 = svm_clf
gs_svm_clf_3 = GridSearchCV(
    estimator=svm_clf_3,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train_dtm_features, y_train)

# Predict on the held-out rows with the search's refit best estimator.
y_pred_3 = gs_svm_clf_3.predict(X_test_dtm_features)
acc_3 = accuracy_score(y_test, y_pred_3)

print(f'F_T\nBest score: {gs_svm_clf_3.best_score_}\nBest params:{gs_svm_clf_3.best_params_}\nAccuracy: {acc_3}')

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 42.6min finished


F_T
Best score: 0.503411306042885
Best params:{'clf__C': 1, 'clf__gamma': 0.01}
Accuracy: 0.5128604832424006


# J_P

In [103]:
# Target for this section: the 'J_P' column of the train/test label frames.
y_train = y_train_targets['J_P']
y_test = y_test_targets['J_P']

# Scale, then classify. Centering is disabled (with_mean=False) because the
# stacked feature matrix is a scipy sparse matrix.
svm_clf = Pipeline(steps=[
    ('standardscaler', StandardScaler(with_mean=False)),
    ('clf', svm.SVC()),
])

# Search grid for the SVC step: 4 values of C x 3 values of gamma = 12 candidates.
Cs = [0.01, 0.1, 1, 10]
gammas = [0.01, 0.1, 1]
parameters = {'clf__gamma': gammas, 'clf__C': Cs}

#model
# NOTE(review): GridSearchCV clones the estimator it is given, so this standalone
# fit should not influence the search below; it only leaves svm_clf_4 as a fitted
# default-parameter pipeline. Consider dropping it if no later cell uses svm_clf_4.
svm_clf.fit(X_train_dtm_features, y_train)
svm_clf_4 = svm_clf
gs_svm_clf_4 = GridSearchCV(
    estimator=svm_clf_4,
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train_dtm_features, y_train)

# Predict on the held-out rows with the search's refit best estimator.
y_pred_4 = gs_svm_clf_4.predict(X_test_dtm_features)
acc_4 = accuracy_score(y_test, y_pred_4)

print(f'J_P\nBest score: {gs_svm_clf_4.best_score_}\nBest params:{gs_svm_clf_4.best_params_}\nAccuracy: {acc_4}')

Fitting 3 folds for each of 12 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed: 42.6min finished


J_P
Best score: 0.5846003898635478
Best params:{'clf__C': 1, 'clf__gamma': 0.01}
Accuracy: 0.5810600155884645
