In [1]:
#readme
#This file uses upsampling
#this file splits into testing and training

In [2]:
#read in csv file
import csv
#vectorizes data.csv
from sklearn.feature_extraction.text import TfidfVectorizer
#used for dataframes
import pandas as pd
#used for splitting
from sklearn.model_selection import train_test_split
#used to perform logistic regression
from sklearn.linear_model import LogisticRegression
#used for logistic regression metrics
from sklearn.metrics import classification_report
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import accuracy_score
#used for upsampling
from sklearn.utils import resample

# Data Frame Creation

In [3]:
#reads in 
df = pd.read_csv('UCSC Dataset 3 Final - Sheet1_clean.csv', encoding='utf-8')

# Creates Training and Testing sets

In [4]:
#Current implementation 
#splits and then runs tf-idf

#used http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
train, test = train_test_split(df, test_size = .1)
#split train set into 
print(len(train)) #180 training posts
print(len(test)) #20

#shows class distribution
print(train['VPN #'].value_counts())

#save max classifications to variable for all minority classes to be upsampled to
new_max = max(train['VPN #'].value_counts().astype(int))

180
20
3    124
2     21
0     15
4     14
1      6
Name: VPN #, dtype: int64


# Represents with TF-IDF

In [5]:
#creates the TF-IDF vectorizer up
#words have to have a minimum document frequency of 2 
#aka be in 2 documents to be included
tfidf = TfidfVectorizer(min_df = 2)

#uses vectorizer to perform fit_transform
train_tfs = tfidf.fit_transform(train['original_post'].values.astype('U'))

#transforms to fit the vocabulary of the training set
test_tfs = tfidf.transform(test['original_post'])
#creates a list of 20 values correlated to the test set
num_list_test = test['VPN #'].values

#Turn TF-IDF array to Pandas DataFrame
new_train= pd.DataFrame(train_tfs.toarray())
#Append new_train with classification values
new_train['VPN #'] = train['VPN #'].values

#shape should be (180, X)
print(new_train.shape)

(180, 1441)


# Upsampling 5+1

In [6]:
#up samples values to highest occurance
#set up currently for 4 classifications
#divides the dataframe into minority and majory dataframes
min_zero = new_train[new_train['VPN #'] == 0]
min_one = new_train[new_train['VPN #'] == 1]
min_two = new_train[new_train['VPN #'] == 2]
maj_three = new_train[new_train['VPN #'] == 3]
min_four = new_train[new_train['VPN #'] == 4]

#upsamples all but the majority
min_zero_up = resample(min_zero, replace=True, n_samples= new_max) 
min_one_up = resample(min_one, replace=True, n_samples= new_max)
min_two_up = resample(min_two, replace=True, n_samples= new_max) 
min_four_up = resample(min_four, replace=True, n_samples= new_max)

#concatinates all of the upsampled datasets with the majority set
new_train = pd.concat([maj_three, min_zero_up, min_one_up, min_two_up, min_four_up])

#prints the value counts
print(new_train['VPN #'].value_counts())
#create the training set classification values
num_list_train = new_train['VPN #'].values

#drops the classification values for training purposes
new_train = new_train.drop('VPN #', axis = 1)

#shape should be (new_max*5, X-1)
print(new_train.shape)

4    124
3    124
2    124
1    124
0    124
Name: VPN #, dtype: int64
(620, 1440)


# Implements Logistic Regression

In [7]:
#sets up instance of logistic regression
logistic = LogisticRegression(C = 1, solver = 'newton-cg', multi_class = 'multinomial', class_weight = 'balanced')
#feeds in (matrix, corresponding classificaiton value)
logistic.fit(new_train,num_list_train)

LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=1, penalty='l2',
          random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
          warm_start=False)

In [8]:
#Runs a single fold validation
#Result varies from run to run
y_pred_class = logistic.predict(test_tfs)
print("accuracy score:", accuracy_score(num_list_test, y_pred_class))

('accuracy score:', 0.69999999999999996)
