#### Read in data from Github and set up for models

In [1]:
#Read in the data
import pandas as pd

df1=pd.read_csv("https://raw.githubusercontent.com/statzenthusiast921/political-emails-analysis/main/main/data/clean_emails_df1.csv")
df2=pd.read_csv("https://raw.githubusercontent.com/statzenthusiast921/political-emails-analysis/main/main/data/clean_emails_df2.csv")
df3=pd.read_csv("https://raw.githubusercontent.com/statzenthusiast921/political-emails-analysis/main/main/data/clean_emails_df3.csv")
df4=pd.read_csv("https://raw.githubusercontent.com/statzenthusiast921/political-emails-analysis/main/main/data/clean_emails_df4.csv")

print(df1.shape)
print(df2.shape)
print(df3.shape)
print(df4.shape)

(19000, 20)
(19000, 20)
(19000, 20)
(17148, 20)


In [2]:
#Stack 4 dataframes together
df = pd.concat([df1, df2, df3, df4], ignore_index=True, axis=0)
#Delete first row and column
df = df[1:]
df = df.iloc[: , 1:]
#Preview dataframe
df.head(2)

Unnamed: 0,subject,date,body,party,country,locality,office,time,AM_PM,Hour,Hour_Mil,month,day,year,month_num,cleaned_body,sentiment,compound,comp_score
1,TAKE ACTION for Freedom: #CloseTheCamps,"July 3, 2019","Dear friend, We have all seen the images and r...",,United States,,,11:31 PM,PM,11,23,July,3,2019,7,dear friend seen images read stories migrants ...,"{'neg': 0.207, 'neu': 0.674, 'pos': 0.12, 'com...",-0.9807,0
2,trauma-informed schools,"July 3, 2019","Team,Access to education is fundamental to a c...",Democratic,United States,Ohio,President of the United States,11:32 PM,PM,11,23,July,3,2019,7,team access education fundamental child succes...,"{'neg': 0.143, 'neu': 0.658, 'pos': 0.199, 'co...",0.7269,1


In [3]:
df['comp_score'].value_counts()

1    61281
0    12866
Name: comp_score, dtype: int64

In [4]:
from sklearn.model_selection import train_test_split

y = df['comp_score']
X = df['cleaned_body'].astype('U').values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(51902,)
(51902,)
(22245,)
(22245,)


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
vectorizer.fit(X)


X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

print('Train Data:',X_train.shape)
print('Test Data:',X_test.shape)

Train Data: (51902, 302765)
Test Data: (22245, 302765)


#### Oversampling to correct for class imbalance

In [33]:
print("Before OverSampling, counts of label 'Positive': {}".format(sum(y_train == 1))) 
print("Before OverSampling, counts of label 'Negative': {} \n".format(sum(y_train == 0))) 

Before OverSampling, counts of label 'Positive': 42907
Before OverSampling, counts of label 'Negative': 8995 



In [9]:
from imblearn.over_sampling import SMOTE 
sm = SMOTE(random_state = 2) 
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.ravel()) 
  
print('After OverSampling, the shape of X_train: {}'.format(X_train_res.shape)) 
print('After OverSampling, the shape of y_train: {} \n'.format(y_train_res.shape)) 
  
print("After OverSampling, counts of label 'Positive': {}".format(sum(y_train_res == 1))) 
print("After OverSampling, counts of label 'Negative': {}".format(sum(y_train_res == 0))) 

After OverSampling, the shape of X_train: (85814, 302765)
After OverSampling, the shape of y_train: (85814,) 

After OverSampling, counts of label 'Positive': 42907
After OverSampling, counts of label 'Negative': 42907


In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, f1_score

#Fit the model on oversampled data
lr1 = LogisticRegression(max_iter=1000) 
lr1.fit(X_train_res, y_train_res.ravel()) 
predictions = lr1.predict(X_test) 
  
#Print classification report 
print(classification_report(y_test, predictions,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.75      0.82      0.78      3871
    Positive       0.96      0.94      0.95     18374

    accuracy                           0.92     22245
   macro avg       0.86      0.88      0.87     22245
weighted avg       0.92      0.92      0.92     22245



In [18]:
y_test.value_counts()

1    18374
0     3871
Name: comp_score, dtype: int64

In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, predictions)
print(cm)

[[ 3162   709]
 [ 1030 17344]]


#### Undersampling to correct for class imbalance 

In [29]:
print("Before Undersampling, counts of label 'Positive': {}".format(sum(y_train == 1))) 
print("Before Undersampling, counts of label 'Negative': {} \n".format(sum(y_train == 0))) 
  
# apply near miss 
from imblearn.under_sampling import NearMiss 
nr = NearMiss() 
  
X_train_miss, y_train_miss = nr.fit_resample(X_train, y_train.ravel()) 
  
print('After Undersampling, the shape of X_train: {}'.format(X_train_miss.shape)) 
print('After Undersampling, the shape of y_train: {} \n'.format(y_train_miss.shape)) 
  
print("After Undersampling, counts of label 'Positive': {}".format(sum(y_train_miss == 1))) 
print("After Undersampling, counts of label 'Negative': {}".format(sum(y_train_miss == 0))) 

Before Undersampling, counts of label 'Positive': 42907
Before Undersampling, counts of label 'Negative': 8995 

After Undersampling, the shape of X_train: (17990, 302765)
After Undersampling, the shape of y_train: (17990,) 

After Undersampling, counts of label 'Positive': 8995
After Undersampling, counts of label 'Negative': 8995


In [31]:
# train the model on train set 
lr2 = LogisticRegression(max_iter=1000) 
lr2.fit(X_train_miss, y_train_miss.ravel()) 
predictions = lr2.predict(X_test) 
  
# print classification report 
print(classification_report(y_test, predictions,target_names=['Negative','Positive']))

              precision    recall  f1-score   support

    Negative       0.40      0.94      0.56      3871
    Positive       0.98      0.71      0.82     18374

    accuracy                           0.75     22245
   macro avg       0.69      0.82      0.69     22245
weighted avg       0.88      0.75      0.78     22245



In [32]:
cm = confusion_matrix(y_test, predictions)
print(cm)

[[ 3622   249]
 [ 5404 12970]]
