In [17]:
#Importing all the necessary libraries

import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
from nltk.corpus import words
nltk.download('stopwords')
# ML Libraries

from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\naras\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\naras\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# define function to read train file

def load_dataset_train(filename):
    dataset_train = pd.read_csv(filename)
    return dataset_train

In [3]:
# define function to read test file

def load_dataset_test(filename):
    dataset_test = pd.read_csv(filename)
    return dataset_test

In [4]:
# define function to remove unwanted columns from train dataset

def remove_unwanted_cols_train(dataset_train, cols):
    for col in cols:
        del dataset_train[col]
    return dataset_train

In [5]:
# define function to remove unwanted columns from test dataset

def remove_unwanted_cols_test(dataset_test, cols):
    for col in cols:
        del dataset_test[col]
    return dataset_test

In [6]:
# define function to clean the tweets

def preprocess_tweet_text(tweet):
    tweet.lower()
    
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    
    # Remove user @ references and '#' from tweet
    tweet = re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|(RT)", " ", tweet)
    
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    
    
    # Remove stopwords
    stop = stopwords.words('english')
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop]
    
    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]
    
    return " ".join(filtered_words)

In [7]:
# Define function to implement vectorization (convert text to numbers) using tf-idf technique

def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

In [8]:
# Calling all the function

#load train dataset
dataset_train = load_dataset_train("train.csv")

#Load test dataset
dataset_test = load_dataset_test("test.csv")

# Remove unwanted columns from train dataset
n_dataset_train = remove_unwanted_cols_train(dataset_train, ['screen_name','tweet_id','tweet_source','retweet_count'])

# Remove unwanted columns from test dataset
n_dataset_test = remove_unwanted_cols_test(dataset_test, ['screen_name','tweet_id','tweet_source','retweet_count'])

In [9]:
dataset_test.head()

Unnamed: 0,tweet_text,Sentiment
0,Y'all just mad because ya'll got caught trying...,Negative
1,RT @adams2011: #Retweet\nI WILL VOTE AGAINST T...,Neutral
2,RT @gomee_art: for sakusa it was love at first...,Positive
3,RT @gomee_art: for sakusa it was love at first...,Positive
4,"@brithume @realDonaldTrump Yeah, look at that....",Positive


In [10]:
#Preprocess train data
dataset_train['tweet_text'] = dataset_train['tweet_text'].apply(str)
dataset_train['tweet_text'] = dataset_train['tweet_text'].apply(preprocess_tweet_text)

In [11]:
# Preprocess test data
dataset_test['tweet_text'] = dataset_test['tweet_text'].apply(str)
dataset_test['tweet_text'] = dataset_test['tweet_text'].apply(preprocess_tweet_text)

In [12]:
dataset_test.head()

Unnamed: 0,tweet_text,Sentiment
0,Y mad ya got caught trying party ignored Socia...,Negative
1,Retweet I WILL VOTE AGAINST TRUMP EVEN IN MY L...,Neutral
2,art sakusa love first disinfectant spray haikyuu,Positive
3,art sakusa love first disinfectant spray haikyuu,Positive
4,Yeah look Somebody right thing Oh funny How ma...,Positive


In [13]:
dataset_train.head()

Unnamed: 0,tweet_text,Sentiment
0,These niggas scammers amp drug dealers NOTHING...,Neutral
1,Dear Mr President China already discovered vac...,Neutral
2,Whippits A NASTY DRUG Do Not get shit never th...,Negative
3,Amy Klobuchar admitted Hydroxychloroquine save...,Neutral
4,Amy Klobuchar admitted Hydroxychloroquine save...,Neutral


In [14]:
# Split dataset into Train, Test

# Same tf vector will be used for Testing sentiments on unseen trending data
tf_vector = get_feature_vector(np.array(dataset_train.iloc[:, 0]).ravel())
X = tf_vector.transform(np.array(dataset_train.iloc[:, 0]).ravel())
y = np.array(dataset_train.iloc[:, 1]).ravel()
X_train = X
y_train = y
X_test = tf_vector.transform(np.array(dataset_test.iloc[:, 0]).ravel())
y_test = np.array(dataset_test.iloc[:, 1]).ravel()

#train_test_split(X, y, test_size=0.2, random_state=30)

In [15]:
y_test

array(['Negative', 'Neutral', 'Positive', ..., 'Negative', 'Neutral',
       'Neutral'], dtype=object)

In [19]:
# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)
print(accuracy_score(y_test, y_predict_nb))

0.55869


In [20]:
# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, y_predict_lr))

0.63186


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
# Training SVM model
from sklearn import svm
SVM_model = svm.SVC(kernel='linear',C=0.025,random_state=101)
SVM_model.fit(X_train, y_train)
y_predict_svm = svm_model.predict(X_test)
print(accuracy_score(y_test, y_predict_svm))

In [18]:
# Testing logistic regression model on test data

test_file_name = "test.csv"
test_ds = load_dataset_test(test_file_name)
test_ds = remove_unwanted_cols_test(test_ds, ['screen_name','tweet_id','tweet_source','retweet_count'])


In [19]:
test_ds.head()

Unnamed: 0,tweet_text,Sentiment
0,Y'all just mad because ya'll got caught trying...,Negative
1,RT @adams2011: #Retweet\nI WILL VOTE AGAINST T...,Neutral
2,RT @gomee_art: for sakusa it was love at first...,Positive
3,RT @gomee_art: for sakusa it was love at first...,Positive
4,"@brithume @realDonaldTrump Yeah, look at that....",Positive


In [20]:
# Using Logistic Regression model for prediction on test data

test_prediction_lr = LR_model.predict(X_test)
print(accuracy_score(y_test, test_prediction_lr))

0.63186


In [22]:
#Print test data
data_tweet = dataset_test['tweet_text']
test_result_ds = pd.DataFrame({'tweet_text': data_tweet, 'prediction':test_prediction_lr})
test_result_ds.head(50)

Unnamed: 0,tweet_text,prediction
0,Y mad ya got caught trying party ignored Socia...,Negative
1,Retweet I WILL VOTE AGAINST TRUMP EVEN IN MY L...,Neutral
2,art sakusa love first disinfectant spray haikyuu,Positive
3,art sakusa love first disinfectant spray haikyuu,Positive
4,Yeah look Somebody right thing Oh funny How ma...,Positive
5,art sakusa love first disinfectant spray haikyuu,Positive
6,TrumpIsAnIdiot midnightmitch,Neutral
7,Retweet I WILL VOTE AGAINST TRUMP EVEN IN MY L...,Neutral
8,7 Within 3 years Trump caused much greater dam...,Neutral
9,art sakusa love first disinfectant spray haikyuu,Positive
