In [177]:
import pandas as pd
import re ,nltk
from sklearn.utils import shuffle
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [12]:
def Preprocessing(Tweets):
    processed=[]
    StopWords={sw for sw in nltk.corpus.stopwords.words('english')} #nltk.corpus.stopwords.words('english')
    s={'\'s','i\'d','he\'s'}
    StopWords=StopWords|s
    
    pattern=re.compile(r'https?[^ ]+|@\w+|[^a-zA-Z#\' ]') #remove username and hyperlinks
    for tweet in Tweets:
        if len(tweet)==0:continue
        tweet=re.sub(r'<.*?>','',tweet) #remove html tags
        tweet=pattern.sub('',tweet).lower().split()
        tweet=' '.join([w for w in tweet if not w in StopWords])
        Root=nltk.stem.PorterStemmer()
        RootWords=[Root.stem(w) for w in tweet.split()]
        tweet=str(' '.join(RootWords))
        tweet=tweet.replace("'","").replace("\"","").replace("#","")
        processed.append(tweet)
    return processed   

In [34]:
df_Obama_tweets=df_Obama_tweets=pd.read_excel('trainingObamaRomneytweets.xlsx',sheet_name='Obama')
df_Romney_tweets=pd.read_excel('trainingObamaRomneytweets.xlsx',sheet_name='Romney')

# retains first 4 columns
df_Obama_tweets=df_Obama_tweets[1:].dropna(axis=1,how='all')
df_Romney_tweets=df_Romney_tweets[1:].dropna(axis=1,how='all')

#rename the colums
df_Obama_tweets.columns=['Date','Time','Tweets','Class']
df_Romney_tweets.columns=['Date','Time','Tweets','Class']
df_Obama_tweets.index=range(1,len(df_Obama_tweets)+1)
df_Romney_tweets.index=range(1,len(df_Romney_tweets)+1)

#drop mixed and !!! class for now(we may consider it later)
df_Romney_tweets=df_Romney_tweets[(df_Romney_tweets['Class'].isin((0,1,-1)))]
df_Obama_tweets=df_Obama_tweets[(df_Obama_tweets['Class'].isin((0,1,-1)))]

#drop empty tweet rows
df_Obama_tweets.dropna(subset=['Tweets'],inplace=True)
df_Romney_tweets.dropna(subset=['Tweets'],inplace=True)

#shuffle the dataframe to avoid bias
df_Obama_tweets=shuffle(df_Obama_tweets).reset_index(drop=True)
df_Romney_tweets=shuffle(df_Romney_tweets).reset_index(drop=True)

#raw tweet and class
Obama_tweet=df_Obama_tweets['Tweets'].tolist()
Obama_class=df_Obama_tweets['Class'].tolist()

Romney_tweet=df_Romney_tweets['Tweets'].tolist()
Romney_class=df_Romney_tweets['Class'].tolist()

In [162]:
# preprocess the tweets
Xtrain_Obama=Preprocessing(Obama_tweet)
Xtrain_Romney=Preprocessing(Romney_tweet)

In [163]:
def Parameter_Tuning(parameters,model,X,y):
    gs_clf=GridSearchCV(model,parameters,cv=10,iid=False,n_jobs=-1)
    GS_clf=gs_clf.fit(X,y)
    print("Best Score for the model--->{0}".format(GS_clf.best_score_))
    print("Best Parameters are...")
    print(GS_clf.best_params_)
    

In [164]:
# Multinomail NB...

#Vectorize the tweet and initialize the classifiers
tweet_clf=Pipeline([
    ('count_vect',CountVectorizer(max_df=.7)),
    ('vect_tfidf',TfidfTransformer()),
    ('clf',MultinomialNB()),
])

#tuning the parameters
parameters={
    'count_vect__ngram_range':[(1,1),(1,2)],
    'count_vect__max_df':(.5,.6,.8,.9),
    'vect_tfidf__use_idf':(True,False),
    'clf__alpha':(1e-2,1e-3)
}

Parameter_Tuning(parameters,tweet_clf,Xtrain_Obama,Obama_class)

Best Score for the model--->0.577881355178453
Best Parameters are...
{'clf__alpha': 0.01, 'count_vect__max_df': 0.5, 'count_vect__ngram_range': (1, 2), 'vect_tfidf__use_idf': False}


In [176]:
#Linear SVM
tweet_clf=Pipeline([
    ('count_vect',CountVectorizer(max_df=.7)),
    ('vect_tfidf',TfidfTransformer()),
    ('clf',LinearSVC(loss='hinge', penalty='l2', random_state=42,
                          max_iter=5)),
])


#tuning the parameters
parameters={
    'count_vect__ngram_range':[(1,1),(1,2)],
    'count_vect__max_df':(.5,.6,.8,.9),
    'vect_tfidf__use_idf':(True,False),
    'clf__C':(1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3)
}

Parameter_Tuning(parameters,tweet_clf,Xtrain_Obama,Obama_class)

Best Score for the model--->0.6021825802053665
Best Parameters are...
{'count_vect__max_df': 0.5, 'clf__C': 1.0, 'count_vect__ngram_range': (1, 2), 'vect_tfidf__use_idf': True}




In [None]:
#Logistic Regression
tweet_clf=Pipeline([
    ('count_vect',CountVectorizer(max_df=.7)),
    ('vect_tfidf',TfidfTransformer()),
    ('clf',LinearSVC(loss='hinge', penalty='l2', random_state=42,
                          max_iter=5)),
])


#tuning the parameters
parameters={
    'count_vect__ngram_range':[(1,1),(1,2)],
    'count_vect__max_df':(.5,.6,.8,.9),
    'vect_tfidf__use_idf':(True,False),
    'clf__C':(1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3)
}

Parameter_Tuning(parameters,tweet_clf,Xtrain_Obama,Obama_class)