<a href="https://colab.research.google.com/github/prishanmu/SI670_FinalProject/blob/master/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn import model_selection
import nltk
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

# Columns that are discarded immediately after loading.
unused_columns = [
    '_golden', '_unit_state', 'orig__golden', 'audience_gold', 'embed',
    'message_gold', '_last_judgment_at', 'bias_gold', 'id', '_unit_id',
    '_trusted_judgments',
]

data = pd.read_csv(
    "../input/political-social-media-posts/political_social_media.csv",
    encoding="ISO-8859-1",
).drop(columns=unused_columns)
data.head()

/kaggle/input/political-social-media-posts/political_social_media.csv


Unnamed: 0,audience,audience:confidence,bias,bias:confidence,message,message:confidence,bioid,label,source,text
0,national,1.0,partisan,1.0,policy,1.0,R000596,From: Trey Radel (Representative from Florida),twitter,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...
1,national,1.0,partisan,1.0,attack,1.0,M000355,From: Mitch McConnell (Senator from Kentucky),twitter,VIDEO - #Obamacare: Full of Higher Costs and ...
2,national,1.0,neutral,1.0,support,1.0,S001180,From: Kurt Schrader (Representative from Oregon),twitter,Please join me today in remembering our fallen...
3,national,1.0,neutral,1.0,policy,1.0,C000880,From: Michael Crapo (Senator from Idaho),twitter,RT @SenatorLeahy: 1st step toward Senate debat...
4,national,1.0,partisan,1.0,policy,1.0,U000038,From: Mark Udall (Senator from Colorado),twitter,.@amazon delivery #drones show need to update ...


In [None]:
##########################
### FILTERING & ENCODING
##########################

### Keep only rows whose audience, bias and message labels all have confidence 1.0
confidence_cols = ['audience:confidence', 'bias:confidence', 'message:confidence']
data = data[(data[confidence_cols] == 1.0).all(axis=1)]
data = data.drop(columns=confidence_cols)

### Split the label column, e.g. "From: Trey Radel (Representative from Florida)",
### into the politician's office type and state.
# A one-character pattern is matched literally by str.split (no regex escaping
# needed), and n=1 guarantees exactly two pieces even if a label contains a
# second "(".
pol_info = data['label'].str.split('(', n=1, expand=True)[1]
from_split = pol_info.str.split('from', n=1, expand=True)
data['pol_type'] = from_split[0]
# regex=True must be explicit: pandas >= 2.0 treats the pattern as literal text
# by default, which would leave the trailing ")" attached to every state name.
# The pieces keep the spaces that surrounded "from", so the dummy columns below
# are named like "pol_state_ Texas".
data['pol_state'] = from_split[1].str.replace(r'\)$', '', regex=True)
data = data.drop(columns=['label'])

### One-hot encode message and pol_state (dummy columns are appended in that order)
data = pd.get_dummies(data, columns=['message', 'pol_state'])

### Integer-encode pol_type, audience, bias, source and bioid with LabelEncoder
# A fresh encoder is fitted per column; LabelEncoder numbers the classes in
# sorted order (so for bias, "neutral" -> 0 and "partisan" -> 1).
for column in ['pol_type', 'audience', 'bias', 'source', 'bioid']:
    data[column] = LabelEncoder().fit_transform(data[column])

data.head()

Unnamed: 0,audience,bias,bioid,source,text,pol_type,message_attack,message_constituency,message_information,message_media,...,pol_state_ South Dakota,pol_state_ Tennessee,pol_state_ Texas,pol_state_ Utah,pol_state_ Vermont,pol_state_ Virginia,pol_state_ Washington,pol_state_ West Virginia,pol_state_ Wisconsin,pol_state_ Wyoming
0,1,1,396,1,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,289,1,VIDEO - #Obamacare: Full of Higher Costs and ...,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,430,1,Please join me today in remembering our fallen...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,63,1,RT @SenatorLeahy: 1st step toward Senate debat...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,462,1,.@amazon delivery #drones show need to update ...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
########################
### TEXT PRE-PROCESSING
########################

# Clean-up patterns, applied in order; every match is replaced by one space.
# regex=True is passed explicitly because pandas >= 2.0 treats str.replace
# patterns as literal text by default, which would turn each step into a no-op.
cleanup_patterns = [
    r'^.+@[^\.].*\.[a-z]{2,}$',                     # a post that is only an e-mail address
    # URLs anywhere in the post. The previous pattern was anchored with ^...$,
    # so it only fired when the whole post was a URL and embedded links survived.
    r'https?://\S+|www\.\S+',
    r'http',                                        # leftover "http" fragments
    r'£|\$',                                        # currency symbols
    r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',   # a post that is only a phone number
    r'\d+(\.\d+)?',                                 # numbers
    r'[^\w\d\s]',                                   # punctuation
    r'\s+',                                         # runs of whitespace
]

processed = data['text']
for pattern in cleanup_patterns:
    processed = processed.str.replace(pattern, ' ', regex=True)
processed = processed.str.strip().str.lower()

# Tokens to discard: English stop words plus tokens judged meaningless for this
# corpus (fragments and what appear to be mis-decoded characters).
stop_words = set(stopwords.words('english'))
meaningless_tokens = {'ûªs', 'û', 'ûªt', 'r', 'en', 'fl', 'p', 'va', 'amp',
                      'icymi', 'th', 'pm', 'hours', 'u'}
ignored_tokens = stop_words | meaningless_tokens

# Single pass over each post; split()/join also normalises the spacing.
processed = processed.apply(lambda post: ' '.join(
    term for term in post.split() if term not in ignored_tokens))


In [None]:
########################
### TEXT PROCESSING
########################

MAXFEATURES = 2000

tfidf_vect = TfidfVectorizer(max_features = MAXFEATURES)

# Vectorise the cleaned posts produced by the pre-processing cell; the raw
# `data['text']` column would bypass all of that clean-up.
text_tfidf = tfidf_vect.fit_transform(processed)

# vocabulary_ maps each term to its column position; sorting by position gives
# the terms in column order so every feature gets a readable string name.
# String names also avoid mixing int and str column labels, which recent
# scikit-learn versions reject when fitting on a DataFrame.
terms = sorted(tfidf_vect.vocabulary_, key=tfidf_vect.vocabulary_.get)
text_tfidf = pd.DataFrame(
    text_tfidf.toarray(),
    # `data` has a non-contiguous index after the confidence filter. Reusing it
    # keeps each TF-IDF row attached to its own post; a default RangeIndex would
    # make concat misalign rows and fill the gaps with NaN.
    index=data.index,
    columns=['tfidf_' + term for term in terms],
)

data = pd.concat([data, text_tfidf], axis=1).drop(['text'], axis=1)
data.head()

Unnamed: 0,audience,bias,bioid,source,pol_type,message_attack,message_constituency,message_information,message_media,message_mobilization,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,1.0,1.0,396.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,289.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,430.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,63.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.158558,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,462.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Below borrowed from [this kernel](https://www.kaggle.com/laiquet/neutral-and-partisan-tweets-posts)

# Our stuff:

## Preprocessing

In [None]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
#train-test split
from sklearn.model_selection import train_test_split
# Target is the encoded bias column; every remaining column is a feature.
y = data['bias']
X = data.drop(columns='bias')

# Hold out 25% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

## Classifier Models

In [None]:
#import statements
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_recall_fscore_support

### Linear SVM

In [None]:
# Linear SVM
from sklearn.svm import LinearSVC

#lsvc = LinearSVC(random_state = 0) #penalty, loss, C

In [None]:
# Linear SVM Grid Search

lsvc = LinearSVC()

# Three values of C crossed with two loss functions, always with the l2 penalty.
param_grid = dict(
    penalty=['l2'],
    C=[.01, .1, 1],
    loss=["squared_hinge", "hinge"],
)

# 5-fold cross-validated search; fit() returns the fitted search object.
CV_lsvc = GridSearchCV(estimator=lsvc, param_grid=param_grid, cv=5).fit(X_train, y_train)

print(CV_lsvc.best_params_, CV_lsvc.best_score_, sep='\n')

In [None]:
# Fit a linear SVM with fixed hyper-parameters on the training split,
# then report macro-averaged precision / recall / F-score on the test split.
clf = LinearSVC(penalty='l2', loss='hinge', C=1, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='macro')

### Kernelized SVM

In [None]:
# Kernelized SVM
from sklearn.svm import SVC

#svc = SVC(random_state =0)


In [None]:
# Kernelized SVM Grid Search

svm = SVC()

# Single-candidate grid; each entry was marked "#change" by the author.
param_grid = dict(
    max_iter=[-1],  # change
    degree=[7],     # change
    C=[1.0],        # change
)

# 5-fold cross-validated search; fit() returns the fitted search object.
CV_svm = GridSearchCV(estimator=svm, param_grid=param_grid, cv=5).fit(X_train, y_train)
print(CV_svm.best_params_)

In [None]:
# Fit the kernelized SVM with fixed hyper-parameters on the training split and
# score it on the held-out split.
# NOTE: per the scikit-learn docs, `degree` only affects the 'poly' kernel; with
# the default kernel used here it is ignored.
clf = SVC(C=1.0, degree = 7, max_iter = -1, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Macro averaging, like the linear SVM, logistic regression and random forest
# evaluation cells, so the reported scores are directly comparable
# (this cell previously used average='weighted').
precision_recall_fscore_support(y_test, y_pred, average='macro')

In [None]:
# Mean cross-validated score of the best candidate found by the kernelized-SVM grid search.
print(CV_svm.best_score_)

### Logistic Regression

In [None]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

#lg = LogisticRegression(random_state = 0)

In [None]:
# Logistic Regression Grid Search
# The grid searches over both l1 and l2 penalties. The default 'lbfgs' solver
# (scikit-learn >= 0.22) does not support l1, so every l1 candidate would fail
# to fit. 'liblinear' supports both penalties and was the default solver in
# older releases. The seed makes the search repeatable.
lg = LogisticRegression(solver='liblinear', random_state=0)

param_grid = {
    'C': [1, 10, 100, 200],
    'penalty': ['l1', 'l2'],
}

# 5-fold cross-validated search over C and the penalty type.
CV_lg = GridSearchCV(estimator=lg, param_grid=param_grid, cv=5)
CV_lg.fit(X_train, y_train)
print(CV_lg.best_params_)
print(CV_lg.best_score_)

In [None]:
# Fit an l1-penalised logistic regression with fixed hyper-parameters and report
# macro-averaged precision / recall / F-score on the held-out split.
# solver='liblinear' is required: the default 'lbfgs' solver (scikit-learn >= 0.22)
# raises an error for penalty='l1'.
clf = LogisticRegression(C=1, penalty='l1', solver='liblinear', random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='macro')

### Random Forest Classifier

In [None]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

#rfc = RandomForestClassifier(random_state = 0)

In [None]:
#Random Forest Grid Search
# Seed the forest so the cross-validation scores, and therefore best_params_,
# are the same on every run. Unseeded forests can rank the candidates
# differently each time.
rfc = RandomForestClassifier(random_state=0)

param_grid = {
    'n_estimators': [15, 20, 30],  # change these
    'max_depth': [15, 20, 30],     # change these
}

# 5-fold cross-validated search over forest size and tree depth.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(X_train, y_train)
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)

In [None]:
# Fit a random forest with fixed hyper-parameters on the training split,
# then report macro-averaged precision / recall / F-score on the test split.
clf = RandomForestClassifier(n_estimators=20, max_depth=30, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)
precision_recall_fscore_support(y_test, y_pred, average='macro')

## Clustering & Bias Prediction

In [None]:
#elbow graph

from sklearn.cluster import KMeans
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics

# Elbow plot: K-means inertia for k = 2..9.
# Use the same feature set as the clustering cell that follows (bias and bioid
# removed). Otherwise the elbow is computed on different data than the model it
# is meant to tune, and the integer-coded bioid dominates the distances.
elbow_features = data.drop(columns=['bias', 'bioid'])
k_values = range(2, 10)

cost = []
for k in k_values:
    # Fixed seed so the curve is identical on every run.
    k_means = KMeans(init='k-means++', n_clusters=k, n_init=10, random_state=0)
    k_means.fit(elbow_features)
    cost.append(k_means.inertia_)

fig, ax = plt.subplots()
ax.plot(k_values, cost, 'b*-')
ax.set_xlim(1, ax.get_xlim()[1])  # start the x-axis at 1, keep the automatic upper limit
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('K-means elbow plot')
plt.show()

In [None]:
#testing with n clusters
n = 4

# Cluster without bias (it is compared against the clusters later) and without bioid.
c_data = data.drop(columns=['bias', 'bioid'])

# fit() returns the estimator itself; the bare name keeps it as the cell's displayed output.
kmeans = KMeans(n_clusters=n, random_state=0).fit(c_data)
kmeans

In [None]:
# Attach each row's cluster label, then show the per-cluster mean of every
# column with clusters as columns.
c_data = c_data.assign(cluster=kmeans.labels_)
c_data.groupby('cluster').mean().T

In [None]:
# Per-cluster mean of every column in c_data (one row per cluster).
x = c_data.groupby('cluster').mean()

In [None]:
# Write the per-cluster means to cluster_means.csv in the current working directory.
x.to_csv('cluster_means.csv')

In [None]:
# Number of rows assigned to each cluster.
c_data['cluster'].value_counts()

In [None]:
# add bias (and bioid) back in
# c_data was derived from data, so both share the same row index. An
# index-aligned join restores the two removed columns row-for-row. The previous
# key-less merge() joined on every shared feature column, which multiplies rows
# whenever two rows have identical features.
# Dropping first (errors='ignore') keeps the cell safe to re-run.
c_data = (
    c_data
    .drop(columns=['bias', 'bioid'], errors='ignore')
    .join(data[['bias', 'bioid']])
)

In [None]:
# Preview the first five rows of the clustered data.
c_data.head(5)

In [None]:
# ANOVA (is this most appropriate?) import statements
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

In [None]:
# anova table set up
def anova_table(aov):
    """Extend an ANOVA table with mean squares and effect sizes.

    Parameters
    ----------
    aov : pandas.DataFrame
        ANOVA table with 'sum_sq', 'df', 'F' and 'PR(>F)' columns. The last
        row is treated as the residual (error) term.

    Returns
    -------
    pandas.DataFrame
        A new table with the columns 'sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)',
        'eta_sq' and 'omega_sq', in that order. 'eta_sq' and 'omega_sq' are
        NaN in the residual row. The input frame is not modified.
    """
    # Work on a copy so the caller's table does not silently gain columns.
    aov = aov.copy()

    aov['mean_sq'] = aov['sum_sq'] / aov['df']

    total_sum_sq = aov['sum_sq'].sum()
    # .iloc[-1] selects the last row by position; plain [-1] on a label-indexed
    # Series is a deprecated positional fallback that newer pandas rejects.
    residual_mean_sq = aov['mean_sq'].iloc[-1]

    # Effect sizes are defined for every row except the residual; assigning the
    # shorter Series leaves NaN in the residual row through index alignment.
    effects = aov.iloc[:-1]
    aov['eta_sq'] = effects['sum_sq'] / total_sum_sq
    aov['omega_sq'] = (
        (effects['sum_sq'] - effects['df'] * residual_mean_sq)
        / (total_sum_sq + residual_mean_sq)
    )

    cols = ['sum_sq', 'df', 'mean_sq', 'F', 'PR(>F)', 'eta_sq', 'omega_sq']
    return aov[cols]

In [None]:
# Linear model of bias on cluster membership, with cluster treated as categorical.
bias_by_cluster = ols(formula="bias ~ C(cluster)", data=c_data)
results = bias_by_cluster.fit()
results.summary()

In [None]:
# Build the ANOVA table (typ=2) from the fitted OLS model, then display it
# extended by anova_table() with mean squares and effect sizes.
aov_table = sm.stats.anova_lm(results, typ=2) # anova table for bias between cluster groups
anova_table(aov_table) # anova table with effect size

In [None]:
# if results are significant:
# Tukey HSD pairwise comparison of bias across clusters.

mc = MultiComparison(c_data['bias'], c_data['cluster'])
mc_results = mc.tukeyhsd(alpha=.05)  # sig at .05 level
print(mc_results, mc.groupsunique, sep='\n')

## Word Cloud by Bias 

In [None]:
#import statements
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
#reload data and text preprocess again

In [None]:
# Number of rows per encoded bias class.
data['bias'].value_counts()

In [None]:
# for bias = 0

# The `text` column was dropped from `data` when the TF-IDF features were added,
# so `data[...].text` no longer exists. Take the cleaned posts from `processed`
# instead, selecting rows by index label so they match the rows still in `data`.
text = processed.loc[data.index[data['bias'] == 0]].values

# Join the posts into one string. str() on an array gives the array's printed
# representation (brackets, quotes, and "..." in place of most elements once it
# is long), not the posts themselves.
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(' '.join(text))

fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [None]:
# for bias = 1

# The `text` column was dropped from `data` when the TF-IDF features were added,
# so `data[...].text` no longer exists. Take the cleaned posts from `processed`
# instead, selecting rows by index label so they match the rows still in `data`.
text = processed.loc[data.index[data['bias'] == 1]].values

# Join the posts into one string. str() on an array gives the array's printed
# representation (brackets, quotes, and "..." in place of most elements once it
# is long), not the posts themselves.
wordcloud = WordCloud(
    width = 3000,
    height = 2000,
    background_color = 'black',
    stopwords = STOPWORDS).generate(' '.join(text))

fig = plt.figure(
    figsize = (40, 30),
    facecolor = 'k',
    edgecolor = 'k')

plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()