In [1]:
#readme
#this file splits into testing and training

In [2]:
#read in csv file
import csv
#vectorizes data.csv
from sklearn.feature_extraction.text import TfidfVectorizer
#used for dataframes
import pandas as pd
#used for splitting
from sklearn.model_selection import train_test_split
#used to perform logistic regression
from sklearn.linear_model import LogisticRegression
#used for logistic regression metrics
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict

# Data Frame Creation

In [3]:
#reads in 
df = pd.read_csv('UCSC Dataset 3 Final - Sheet1_clean.csv', encoding='utf-8')

# Creates Training and Testing sets

## Represents with TF-IDF

In [4]:
#Current implementation 
#splits and then runs tf-idf

#used http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
#split train set into 
train, test = train_test_split(df, test_size = .1)


tfidf = TfidfVectorizer(min_df = 2)
#uses vectorizer to perform fit_transform
train_tfs = tfidf.fit_transform(train['original_post'].values.astype('U'))
#transforms to fit the vocabulary of the training set
test_tfs = tfidf.transform(test['original_post'])
whole_tfs = tfidf.fit_transform(df['original_post'])

#creates a list of 180 values correlated to the training set
num_list_train = train['VPN #'].values
#creates a list of 20 values correlated to the test set
num_list_test = test['VPN #'].values
num_list_whole = df['VPN #'].values

In [5]:
print(df['VPN #'].value_counts())
print(train_tfs.shape)

3    136
2     25
0     17
4     16
1      6
Name: VPN #, dtype: int64
(180, 1468)


# Implements Logistic Regression

In [6]:
#sets up instance of logistic regression
logistic = LogisticRegression(C=1, solver = 'newton-cg', multi_class = 'multinomial', class_weight = 'balanced')
#feeds in (matrix, corresponding classificaiton value)
logistic.fit(train_tfs,num_list_train)

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=1, penalty='l2',
          random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
          warm_start=False)

# Stratified K-Folds

In [7]:
def metrics():
    """
    STRATIFIED
    Prints out metrics for performace of: 
        Logistic Regression Classification Accuracy
        Logistic Regression Metrics
        
    taken from here
    https://machinelearningmastery.com/metrics-evaluate-machine-learning-algorithms-python/
    """
    
    #stratified cross validation
    skf = StratifiedKFold(n_splits=6)
    skf.get_n_splits(whole_tfs,num_list_whole)
    print(skf)
    

    print("Logistic Regression Classification Accuracy:")

    #accuracy
    results = model_selection.cross_val_score(logistic, whole_tfs,num_list_whole, cv=skf, scoring='accuracy')
    print("Accuracy: %.3f (%.3f)\n") % (results.mean(), results.std())

    #confusion matrix
    y_pred = cross_val_predict(logistic,whole_tfs,num_list_whole,cv=skf)
    conf_mat = confusion_matrix(num_list_whole ,y_pred)
    print("Confusion Matrix:\n%s\n"%conf_mat)

    #classification report
    y_pred = cross_val_predict(logistic,whole_tfs,num_list_whole,cv=skf)
    report = classification_report(num_list_whole, y_pred)
    print("Classification Report:\n%s\n"%report)

    print("Logistic Regression Metrics:\nmetric: mean (standard deviation)\n")
    #MAE (Mean Absolute Error)
    results = model_selection.cross_val_score(logistic, whole_tfs,num_list_whole, cv=skf, scoring='neg_mean_absolute_error')
    print("MAE (Mean Absolute Error): %.3f (%.3f)") % (results.mean(), results.std())

    #MSE (Mean Squared Error)
    results = model_selection.cross_val_score(logistic, whole_tfs,num_list_whole, cv=skf, scoring='neg_mean_squared_error')
    print("MSE (Mean Squared Error): %.3f (%.3f)") % (results.mean(), results.std())

    #R^2
    results = model_selection.cross_val_score(logistic, whole_tfs,num_list_whole, cv=skf, scoring='r2')
    print("R^2: %.3f (%.3f)\n") % (results.mean(), results.std())
    
   

In [8]:
#stratified
metrics()

StratifiedKFold(n_splits=6, random_state=None, shuffle=False)
SVM Classification Accuracy:
Accuracy: 0.696 (0.030)

Confusion Matrix:
[[  2   0   4  11   0]
 [  1   0   1   4   0]
 [  3   0  11  10   1]
 [  6   2   4 122   2]
 [  0   0   2  10   4]]

Classification Report:
             precision    recall  f1-score   support

          0       0.17      0.12      0.14        17
          1       0.00      0.00      0.00         6
          2       0.50      0.44      0.47        25
          3       0.78      0.90      0.83       136
          4       0.57      0.25      0.35        16

avg / total       0.65      0.69      0.66       200


Logistic Regression Metrics:
metric: mean (standard deviation)

MAE (Mean Absolute Error): -0.555 (0.062)
MSE (Mean Squared Error): -1.226 (0.208)
R^2: -0.279 (0.181)

