In [37]:
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.metrics import confusion_matrix

In [2]:
def get_mean_scores_by_date(data, topic, year):
    path = 'data/processed-sent/'+topic+'/'+year+'/hdsentiment.csv'
    df = pd.read_csv(path)
    df2 = df.groupby('dates')['scores'].mean().to_frame().reset_index()
    second_col = topic + '_scores'
    df2.columns = ['dates', second_col]
    
    if len(data) == 0:
        return df2
    
    df3 = pd.merge(data, df2, how='outer', on=['dates'])
    return df3

In [3]:
def get_win_by_year(year):
    if year == '2016' or year == '2004' or year == '2000':
        return 1
    else:
        return 0
        

In [4]:
def combine_data_by_year():
    path = 'data/processed-sent-merge/'

    for y in ['2016', '2012', '2008', '2004', '2000', '2020']:
        data = []
        for t in ['candidates', 'health', 'immigration', 'jobs-race', 'environment', 'guns', 'party', 'economy']:
            data = get_mean_scores_by_date(data, t, y)

        if y != '2020':
            data['winner'] = get_win_by_year(y)
            
        data.to_csv(path + y + '/combinedtopics_sent.csv')

In [5]:
combine_data_by_year()

In [6]:
def combine_all_horizontally(test_year):
    path = 'data/processed-sent-merge/'
    train_data = []
    test_data = []
    
    for y in ['2016', '2012', '2008', '2004', '2000']:
        if test_year == y:
            test_data = pd.read_csv(path + y + '/combinedtopics_sent.csv')
            continue
            
        if len(train_data) == 0:
            train_data = pd.read_csv(path + y + '/combinedtopics_sent.csv')
        else:
            df = pd.read_csv(path + y + '/combinedtopics_sent.csv')
            train_data = train_data.append(df)   
    
    train_data.to_csv(path + 'combined/combined_all_sent_train.csv', index=False)
    test_data.to_csv(path + 'combined/combined_all_sent_test.csv', index=False)

In [17]:
combine_all_horizontally('2004')

In [18]:
train_df = pd.read_csv('data/processed-sent-merge/combined/combined_all_sent_train.csv')
test_df = pd.read_csv('data/processed-sent-merge/combined/combined_all_sent_test.csv')

In [19]:
train_df = train_df.drop(train_df.columns[0], axis=1)
train_df.head()

Unnamed: 0,dates,candidates_scores,health_scores,immigration_scores,jobs-race_scores,environment_scores,guns_scores,party_scores,economy_scores,winner
0,2016-08-08,1.9,,1.714286,2.0,1.428571,1.375,2.0,1.6875,1
1,2016-08-09,1.68,2.0,1.692308,1.5,1.909091,2.0,1.68,1.8125,1
2,2016-08-10,1.652174,2.0,2.181818,2.142857,2.142857,2.0,1.55,1.818182,1
3,2016-08-11,1.962963,2.0,1.785714,2.0,1.777778,2.0,1.888889,2.142857,1
4,2016-08-12,1.62069,2.0,1.5625,2.5,1.8,1.611111,1.866667,1.818182,1


In [20]:
test_df = test_df.drop(test_df.columns[0], axis=1)
test_df.head()

Unnamed: 0,dates,candidates_scores,health_scores,immigration_scores,jobs-race_scores,environment_scores,guns_scores,party_scores,economy_scores,winner
0,2004-08-02,2.166667,,2.0,2.0,1.333333,2.5,2.333333,1.923077,1
1,2004-08-03,2.0,,2.2,2.0,2.0,2.2,1.777778,2.090909,1
2,2004-08-05,2.0,2.0,2.0,2.0,1.875,2.0,2.4,2.166667,1
3,2004-08-06,1.0,2.0,2.0,1.5,,2.0,2.0,1.875,1
4,2004-08-07,2.285714,2.0,,2.0,2.0,2.0,2.0,1.916667,1


In [21]:
#Fill nan values
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

In [32]:
#trainable_cols=['candidates_scores', 'health_scores', 'immigration_scores', 'jobs-race_scores', 
#                'environment_scores', 'guns_scores', 'party_scores', 'economy_scores'] 

trainable_cols=['candidates_scores', 'health_scores', 'jobs-race_scores', 'guns_scores', 'economy_scores'] 


train_X=train_df[trainable_cols]
train_Y=train_df['winner']

test_X=test_df[trainable_cols]
test_Y=test_df['winner']

In [33]:
test_X.tail()

Unnamed: 0,candidates_scores,health_scores,jobs-race_scores,guns_scores,economy_scores
88,1.918384,2.0,1.0,2.0,1.833333
89,1.918384,2.0,1.666667,2.0,2.0
90,1.918384,1.94974,1.96173,2.0,2.0
91,1.918384,1.94974,2.0,1.8,2.1
92,1.918384,1.94974,1.0,1.949175,2.0


#### Check features validity

In [34]:
logit_model=sm.Logit(train_Y,train_X)
result=logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.675850
         Iterations 4
                         Results: Logit
Model:                Logit            Pseudo R-squared: 0.025   
Dependent Variable:   winner           AIC:              512.8326
Date:                 2020-06-14 15:05 BIC:              532.4270
No. Observations:     372              Log-Likelihood:   -251.42 
Df Model:             4                LL-Null:          -257.85 
Df Residuals:         367              LLR p-value:      0.011934
Converged:            1.0000           Scale:            1.0000  
No. Iterations:       4.0000                                     
-----------------------------------------------------------------
                   Coef.  Std.Err.    z    P>|z|   [0.025  0.975]
-----------------------------------------------------------------
candidates_scores -0.8696   0.3623 -2.4004 0.0164 -1.5796 -0.1595
health_scores     -0.6385   0.3075 -2.0762 0.0379 -1.2412 -0.0358


In [35]:
logreg = LogisticRegression()
logreg.fit(train_X, train_Y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [36]:
pred_y = logreg.predict(test_X)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(test_X, test_Y)))

Accuracy of logistic regression classifier on test set: 0.54


In [41]:
confusion_matrix = confusion_matrix(test_Y, pred_y)
print(confusion_matrix)

[[ 0  0]
 [43 50]]


In [42]:
from sklearn.metrics import classification_report
print(classification_report(test_Y, pred_y))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.54      0.70        93

   micro avg       0.54      0.54      0.54        93
   macro avg       0.50      0.27      0.35        93
weighted avg       1.00      0.54      0.70        93



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
