In [23]:
from __future__ import print_function
from IPython.display import display, HTML
import os
import sys
import csv
import time
import statistics
import numpy as np
import string
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from sklearn import svm
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score, accuracy_score, recall_score

In [24]:
def loadFeaturesAndLabels(csvPath):
    """Read one cleaned-data CSV and split it into feature and label arrays.

    Parameters
    ----------
    csvPath : str
        Path of the CSV file. It must contain a 'depressed' column (the
        label) and a 'text' column; its first column is also discarded
        (presumably a saved index column -- TODO confirm).

    Returns
    -------
    tuple
        (dataDf, featuresDf, featuresNpArray, labelsDf, labelsNpArray) where
        featuresNpArray is a float64 array with NaN replaced by 0.0 and
        labelsNpArray is an (n, 1) array with every -1 label mapped to 0.
    """
    dataDf = pd.read_csv(csvPath)
    #Everything except the label, the first column and the raw text is a feature
    featuresDf = dataDf.drop(['depressed', dataDf.columns[0], 'text'], axis=1)
    #.values replaces the deprecated as_matrix(); astype() returns a copy, so
    #zero-filling the NaNs below never writes back into featuresDf
    featuresNpArray = featuresDf.values.astype(np.float64)
    featuresNpArray[np.isnan(featuresNpArray)] = 0.0
    #Labels stay 2-D, shape (n, 1); copy so the relabelling leaves labelsDf untouched
    labelsDf = dataDf[['depressed']]
    labelsNpArray = labelsDf.values.copy()
    #convert all -1 to 0
    labelsNpArray[labelsNpArray == -1] = 0
    return dataDf, featuresDf, featuresNpArray, labelsDf, labelsNpArray


#Chinese data set
(chinDataDf, chinFeaturesDf, chinFeaturesNpArray,
 chinLabelsDf, chinLabelsNpArray) = loadFeaturesAndLabels("data_with_features/chin_cleaned_data_f.csv")
print("Chin data shape")
print(chinDataDf.shape)
print(chinFeaturesDf.shape)
print(chinLabelsDf.shape)
#English data set
(engDataDf, engFeaturesDf, engFeaturesNpArray,
 engLabelsDf, engLabelsNpArray) = loadFeaturesAndLabels("data_with_features/eng_cleaned_data_f.csv")
print("Eng data shape")
print(engDataDf.shape)
print(engFeaturesDf.shape)
print(engLabelsDf.shape)

Chin data shape
(804, 79)
(804, 76)
(804, 1)
Eng data shape
(3570, 79)
(3570, 76)
(3570, 1)


In [25]:
#Hyper-parameter grids handed to the grid search: one grid for the RBF kernel
#(C x gamma) and one for the linear kernel (C only)
tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[0.1, 1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[0.1, 1, 10, 100, 1000]),
]
#Scoring metrics; a separate grid search is run for each one
scores = [
    'accuracy',
    'precision',
    'recall',
]

### SVM baseline for Chinese data

In [26]:
#Train:Test ratio = 80%:20%
#Stratified hold-out split. The grid search cross-validates on the training
#portion only; the test portion is kept aside for the final evaluation.
chinXTrain, chinXTest, chinYTrain, chinYTest = train_test_split(
    chinFeaturesNpArray, chinLabelsNpArray,
    test_size=0.2, shuffle=True, random_state=228, stratify=chinLabelsNpArray)
#The labels arrive as an (n, 1) column; flatten them to 1-D for the classifier
chinYTrain = chinYTrain.ravel()
chinYTest = chinYTest.ravel()

In [27]:
#Run one grid search per scoring metric on the Chinese training split, then
#evaluate the best estimator of each search on the held-out test split.
#NOTE(review): grid_scores_ belongs to the deprecated sklearn.grid_search API;
#sklearn.model_selection.GridSearchCV exposes cv_results_ instead.
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(chinXTrain, chinYTrain)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    #The per-fold scores get their own name: calling them `scores` would
    #overwrite the metric list this loop iterates over and break a re-run
    #NOTE(review): the +/- band printed here is half the fold standard deviation
    for params, meanScore, foldScores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (meanScore, foldScores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    bestSvm = clf.best_estimator_
    yTrue, yPred = chinYTest, bestSvm.predict(chinXTest)
    #Average precision is a ranking metric: feed it the continuous decision
    #values, not the thresholded 0/1 predictions
    svm_precision_score = average_precision_score(yTrue, bestSvm.decision_function(chinXTest))
    svm_recall_score = recall_score(yTrue, yPred, average='binary')
    print('Average precision-recall score: {0:0.2f}'.format(svm_precision_score))
    print('Recall score: {0:0.2f}'.format(svm_recall_score))
    print('Accuracy: ' + str(accuracy_score(yTrue, yPred)))
    print(classification_report(yTrue, yPred))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Grid scores on development set:

0.659 (+/-0.001) for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.001}
0.659 (+/-0.001) for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.0001}
0.670 (+/-0.010) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.661 (+/-0.003) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.664 (+/-0.011) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.664 (+/-0.006) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.658 (+/-0.005) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.669 (+/-0.008) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.714 (+/-0.017) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.664 (+/-0.009) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}
0

### SVM baseline for English data

In [31]:
#Hyper-parameter grids for the English-data grid search: one grid for the RBF
#kernel (C x gamma) and one for the linear kernel (C only)
tuned_parameters = [
    dict(kernel=['rbf'], gamma=[1e-3, 1e-4], C=[0.1, 1, 10, 100, 1000]),
    dict(kernel=['linear'], C=[0.1, 1, 10, 100, 1000]),
]
#Scoring metrics; a separate grid search is run for each one
engScores = [
    'accuracy',
    'precision',
    'recall',
]

In [32]:
#Train:Test ratio = 80%:20%
#Stratified hold-out split. The grid search cross-validates on the training
#portion only; the test portion is kept aside for the final evaluation.
engXTrain, engXTest, engYTrain, engYTest = train_test_split(
    engFeaturesNpArray, engLabelsNpArray,
    test_size=0.2, shuffle=True, random_state=228, stratify=engLabelsNpArray)
#The labels arrive as an (n, 1) column; flatten them to 1-D for the classifier
engYTrain = engYTrain.ravel()
engYTest = engYTest.ravel()

In [33]:
#Run one grid search per scoring metric on the English training split, then
#evaluate the best estimator of each search on the held-out test split.
#NOTE(review): grid_scores_ belongs to the deprecated sklearn.grid_search API;
#sklearn.model_selection.GridSearchCV exposes cv_results_ instead.
for score in engScores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(svm.SVC(C=1), tuned_parameters, cv=5, scoring=score)
    clf.fit(engXTrain, engYTrain)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_estimator_)
    print()
    print("Grid scores on development set:")
    print()
    #The per-fold scores get their own name so this loop does not overwrite
    #the global `scores` list of metrics
    #NOTE(review): the +/- band printed here is half the fold standard deviation
    for params, meanScore, foldScores in clf.grid_scores_:
        print("%0.3f (+/-%0.03f) for %r"
              % (meanScore, foldScores.std() / 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    bestSvm = clf.best_estimator_
    yTrue, yPred = engYTest, bestSvm.predict(engXTest)
    #Average precision is a ranking metric: feed it the continuous decision
    #values, not the thresholded 0/1 predictions
    svm_precision_score = average_precision_score(yTrue, bestSvm.decision_function(engXTest))
    svm_recall_score = recall_score(yTrue, yPred, average='binary')
    print('Accuracy: ' + str(accuracy_score(yTrue, yPred)))
    print('Average precision-recall score: {0:0.2f}'.format(svm_precision_score))
    print('Recall score: {0:0.2f}'.format(svm_recall_score))
    #yTrue, yPred = engYTest, clf.predict(engXTest)
    #print(classification_report(yTrue, yPred))
    print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

SVC(C=1000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Grid scores on development set:

0.529 (+/-0.000) for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.001}
0.529 (+/-0.000) for {'kernel': 'rbf', 'C': 0.1, 'gamma': 0.0001}
0.529 (+/-0.000) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.001}
0.529 (+/-0.000) for {'kernel': 'rbf', 'C': 1, 'gamma': 0.0001}
0.707 (+/-0.009) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.001}
0.529 (+/-0.000) for {'kernel': 'rbf', 'C': 10, 'gamma': 0.0001}
0.800 (+/-0.009) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.001}
0.707 (+/-0.009) for {'kernel': 'rbf', 'C': 100, 'gamma': 0.0001}
0.812 (+/-0.011) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.001}
0.800 (+/-0.009) for {'kernel': 'rbf', 'C': 1000, 'gamma': 0.0001}