In [2]:
import pandas as pd

pd.set_option('display.max_colwidth', -1)

pd.set_option('mode.chained_assignment', None)

df=pd.read_csv('C:/Users/deCaY/Desktop/final_tweet_collection.csv', low_memory=False)

In [4]:
import re


import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import itertools

def tweet_cleaner(df):
    
    for i in range(0,len(df)):
        txt = df.loc[i,'tweet_text']
        x = re.sub( "[#./:&;’“]+[a-zA-z0-9-]*", "", txt)                #Remove any punctuation marks and hashtags
        x = re.sub( "https?", "", x)                                                #Remove URLs
        x = re.sub("/\b\d[^th][^nd][^st][^rd][a-zA-z]+", "", x)    #Remove typos beginning with numbers
        x=" ".join(x.split())                                                          #Remove unnecessary blank spaces
        df.loc[i,'tweet_text']=x
    
    all_tweets_no_urls = df['tweet_text']
    
    #Tokenize the string(can also be performed using nltk.tokenize)


    words_in_tweet = [tweet.lower().split() for tweet in all_tweets_no_urls]
    
    #Stopwords are words which do not add much meaning to a sentence
    #They can safely be ignored without sacrificing the meaning of the sentence
    #These add no value to the analysis and removing them will improve processsing time and accuracy of the classifier


    stop_words = set(stopwords.words('english'))
    stop_words.update(['u','ur','i',"i’m","i’ve"])
    
    tweets_nsw = [[word for word in tweet_words if not word in stop_words]
              for tweet_words in words_in_tweet]
    
    #itertools.chain is a function that takes a series of iterables and returns one iterable
    #Create a single list of individual words


    all_words_nsw = list(itertools.chain(*tweets_nsw))

    #Some of the tweets contain phrases in Hindi, Spanish and other languages
    #Remove all non-English words


    words = set(nltk.corpus.words.words())

    eng_only=[]
    for i in all_words_nsw:
        if i.lower() in words:
            eng_only.append(i)
    
    #Lemmatization is the process of normalizing words into their base or root form


    cleaned=[]   
    lemmatizer = WordNetLemmatizer()
    for word in eng_only:
        cleaned.append(lemmatizer.lemmatize(word))
    
    return(cleaned)

In [5]:
positive_tweets = df[df['sentiments'] == 'Positive']

positive_tweets.reset_index(inplace=True)

cleaned_positive_tweets = tweet_cleaner(positive_tweets)


negative_tweets = df[df['sentiments'] == 'Negative']

negative_tweets.reset_index(inplace=True)

cleaned_negative_tweets = tweet_cleaner(negative_tweets)


miscellaneous_tweets = df[df['sentiments'] == 'Miscellaneous']

miscellaneous_tweets.reset_index(inplace=True)

cleaned_miscellaneous_tweets = tweet_cleaner(miscellaneous_tweets)


all_cleaned_tweets = cleaned_positive_tweets + cleaned_negative_tweets + cleaned_miscellaneous_tweets

In [6]:
from sklearn.feature_extraction.text import CountVectorizer


cv = CountVectorizer()
cv.fit(all_cleaned_tweets)
X = cv.transform(all_cleaned_tweets)

In [7]:
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score

from sklearn.model_selection import train_test_split


target=[]

for i in range(len(all_cleaned_tweets)):
    if (i<len(cleaned_positive_tweets)):
        target.append(2)
    elif (i>len(cleaned_positive_tweets) & i<len(cleaned_positive_tweets)+len(cleaned_negative_tweets)):
        target.append(1)
    else :
        target.append(0)



X_train, X_val, y_train, y_val = train_test_split(X, target, train_size = 0.75, random_state=0)


for c in [0.01, 0.05, 0.25, 0.5, 1]:

    lr = LogisticRegression(C=c, solver='lbfgs', multi_class='auto', max_iter=1000, random_state=0)

    lr.fit(X_train, y_train)

    print ("Accuracy for C=%s: %s"% (c, accuracy_score(y_val, lr.predict(X_val))))

Accuracy for C=0.01: 0.7126373626373627
Accuracy for C=0.05: 0.7159340659340659
Accuracy for C=0.25: 0.7236263736263736
Accuracy for C=0.5: 0.7236263736263736
Accuracy for C=1: 0.7230769230769231
