In [None]:
# Show the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer 
import html
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
import unicodedata

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [None]:
# Install `contractions` through pip, invoked with the interpreter returned by sys.executable.
import sys  
!{sys.executable} -m pip install contractions

In [None]:
import contractions

In [None]:
# Load the tweet CSV. header=None: the first line of the file is treated as data,
# so the columns come back numbered by position.
data = pd.read_csv(
    '../input/twittersentimentsdata/TwitterSentimentAnalysis.csv',
    header=None,
    encoding='latin-1',
)
data.head()

In [None]:
# Size of the loaded frame as (rows, columns).
data.shape

In [None]:
# Assign the six column names by position, then preview the result.
data.columns=["Sentiment", "Id", "Date", "None", "UserId", "Tweet"]
data.head()

In [None]:
# Keep only the label and the text. Selecting the wanted columns (instead of dropping the
# unwanted ones in place) leaves the same two-column frame and lets the cell be re-run
# without a KeyError on the already-removed columns.
data = data.loc[:, ["Sentiment", "Tweet"]].copy()
data.head()

In [None]:
# Number of rows per sentiment label.
data["Sentiment"].value_counts()

In [None]:
# Working sample: rows 0-4999 plus every row from position 1,595,000 onward,
# stacked and renumbered from 0 (ignore_index=True already yields a 0..n-1 index).
sample_parts = [data.iloc[:5000], data.iloc[1595000:]]
data1 = pd.concat(sample_parts, axis=0, ignore_index=True)

In [None]:
# Split the sample into a one-column feature frame (X) and a one-column label frame (Y).
X, Y = (data1[column].to_frame() for column in ("Tweet", "Sentiment"))
X.shape
Y.shape

In [None]:
# Preview the first rows of the features and of the labels.
X.head()
Y.head()

In [None]:
# Lower-case every tweet.
X['Tweet']=X['Tweet'].str.lower()

In [None]:
# Tweet stored under index label 0, for a quick look at the text.
X['Tweet'][0]

In [None]:
# Number of tweets in X.
len(X['Tweet'])

In [None]:
# Accent stripping helper
def remove_accented_chars(text):
    """Return ``text`` reduced to plain ASCII.

    The string is NFKD-normalised, which separates base letters from their
    combining marks; every character that still cannot be encoded as ASCII
    is then silently dropped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def _clean_tweet(text):
    """Return one tweet with entities decoded and markup, links, mentions,
    digit-bearing tokens and accents removed."""
    text = html.unescape(text)                    # decode HTML entities such as &amp;
    text = BeautifulSoup(text, "lxml").text       # drop HTML tags, keep the text
    text = re.sub(r"http[s]?://\S+", "", text)    # URLs
    text = re.sub(r"@\w+", "", text)              # @mentions
    text = re.sub(r"\S*\d\S*", "", text).strip()  # every token that contains a digit
    return remove_accented_chars(text)


# Work on a copy: a plain `X1 = X` only creates a second name for the same frame.
# Assigning the whole column at once also avoids per-row chained indexing
# (X1['Tweet'][i] = ...), which pandas does not guarantee writes to the frame.
X1 = X.copy()
X1['Tweet'] = X['Tweet'].apply(_clean_tweet)

In [None]:
# Tweet under index label 0, to spot-check the current state of the text.
X1['Tweet'][0]

In [None]:
# Pass every tweet through contractions.fix.
# The column is replaced in one assignment instead of writing row by row through
# chained indexing (X1['Tweet'][i] = ...), which pandas does not guarantee to update X1.
X1['Tweet'] = X1['Tweet'].apply(contractions.fix)

In [None]:
# Tweet under index label 0, to spot-check the current state of the text.
X1['Tweet'][0]

In [None]:
# Try the tokenizer on a single tweet: each run of word characters (\w+) becomes one token.
tokens=re.findall(r'\w+', (X1["Tweet"][0]))
tokens

In [None]:
# Tokenise every tweet into its runs of word characters.
# apply() on the column returns a Series, so it is wrapped back into a one-column frame.
type(X1)
word_run = re.compile(r'\w+')
X1 = X1['Tweet'].apply(word_run.findall)
X1.head()
type(X1)
X1 = X1.to_frame()

In [None]:
# Show the container type currently bound to X1.
type(X1)

In [None]:
# Entry under index label 0 of the Tweet column.
X1['Tweet'][0]

In [None]:
# NLTK's English stopwords, de-duplicated through a set and stored as a list.
stopwords1=list(set(stopwords.words('english')))
len(stopwords1)
stopwords1

In [None]:
# Remove stopwords from every token list.
# A set gives constant-time membership tests (stopwords1 is a list), and the column is
# replaced in one assignment instead of writing lists row by row through chained indexing.
stopword_set = set(stopwords1)
X1['Tweet'] = X1['Tweet'].apply(
    lambda words: [word for word in words if word not in stopword_set]
)

In [None]:
# Entry under index label 0 of the Tweet column, to spot-check the stopword filter.
X1['Tweet'][0]

In [None]:
# nltk.pos_tag takes a list of tokens, even when only a single word is tagged.
nltk.pos_tag(X1['Tweet'][0])

In [None]:
# Lemmatization helpers
def get_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag (J, N, V or R) selects adjective, noun, verb or adverb.
    Any other tag falls back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    wordnet_pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # NOUN is the default part of speech
    return wordnet_pos_by_initial.get(initial, wordnet.NOUN)

lemmatizer = WordNetLemmatizer()

# Lemmatise every token using the part of speech returned by get_pos.
# The column is replaced in one assignment instead of writing lists row by row through
# chained indexing (X1['Tweet'][i] = ...), which pandas does not guarantee to update X1.
X1['Tweet'] = X1['Tweet'].apply(
    lambda words: [lemmatizer.lemmatize(word, get_pos(word)) for word in words]
)

In [None]:
# Entries under index labels 0 and 1, to spot-check the lemmatised tokens.
X1['Tweet'][0]
X1['Tweet'][1]

In [None]:
# Plain Python list holding the Tweet column's entries in row order.
tweets=list(X1['Tweet'])

def tokenizing(text):
    """Return the input unchanged.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so the
    vectorizer receives each document exactly as supplied.
    """
    return text

# The documents are already token lists, so tokenising and preprocessing are both
# replaced by the pass-through function and lower-casing is switched off.
vectorizer = TfidfVectorizer(tokenizer=tokenizing, preprocessor=tokenizing, lowercase=False)
vectors = vectorizer.fit_transform(tweets)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when available
# and fall back to the old method on older releases. Either way `features` is a list.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
print(vectors.shape)

In [None]:
# Expand the sparse TF-IDF matrix into a dense array, where zero entries are stored
# explicitly, and wrap it in a DataFrame with one column per vocabulary term.
# toarray() hands pandas a NumPy array directly, avoiding an intermediate nested Python list.

dense = vectors.toarray()
df = pd.DataFrame(dense, columns=features)

In [None]:
# Peek at both ends of the vocabulary: the first 50 terms and everything from position 10980 on.
features[0:50]
features[10980:]

In [None]:
# Preview the first rows of the dense TF-IDF frame.
df.head()

In [None]:
# Discard the first 12 vocabulary terms (presumably junk tokens spotted in the vocabulary
# preview - TODO confirm). Dropping them by name with errors="ignore" removes the same
# columns on the first run but, unlike a positional drop, does not strip 12 more columns
# each time the cell is re-run.
df = df.drop(columns=list(features[:12]), errors="ignore")
df.shape
df.head()

In [None]:
# Hold out 20% of the 10,000 rows (2,000) for testing: the first 1,000 and the last 1,000.
# The 8,000 rows in between form the training set.
test_head, train_span, test_tail = slice(0, 1000), slice(1000, 9000), slice(9000, None)

x_test = pd.concat([df.iloc[test_head], df.iloc[test_tail]], axis=0, ignore_index=True)
y_test = pd.concat([Y.iloc[test_head], Y.iloc[test_tail]], axis=0, ignore_index=True)

# Renumber the training rows from 0 so features and labels share the same index.
x_train = df.iloc[train_span]
x_train.index = range(len(x_train))
y_train = Y.iloc[train_span]
y_train.index = range(len(y_train))

x_test.shape
x_train.shape
y_test["Sentiment"].value_counts()
y_train["Sentiment"].value_counts()

In [None]:
# Inspect both ends of the training features and labels.
x_train.head()
x_train.tail()
y_train.head()
y_train.tail()

In [None]:
def fit_model_accuracy(model, x_train, y_train, x_test, y_test, n_splits=3, random_state=1):
    """Cross-validate ``model`` on the training split, then refit it and score the test split.

    Prints the raw ``cross_validate`` result with the mean and standard deviation
    of the fold test accuracies, followed by the train accuracy, test accuracy and
    test confusion matrix of the model refitted on the full training split.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
        Fitted in place on the full training split after cross-validation.
    x_train, x_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the two splits. A one-column DataFrame or a 1-D array is accepted.
    n_splits : int, default 3
        Number of stratified folds.
    random_state : int, default 1
        Seed for the shuffled fold assignment.

    Returns
    -------
    None
    """
    # Estimators expect 1-D labels; flattening here lets callers pass a one-column frame.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    cv_scores = cross_validate(model, x_train, y_train, scoring='accuracy', cv=folds, return_train_score=True)
    print(cv_scores, cv_scores['test_score'].mean(), cv_scores['test_score'].std())

    # cross_validate leaves `model` itself unfitted, so fit it on the whole training split.
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print("Train Accuracy :: ", accuracy_score(y_train, model.predict(x_train)))
    print("Test Accuracy  :: ", accuracy_score(y_test, predictions))
    print(" Confusion matrix \n", confusion_matrix(y_test, predictions))

In [None]:
# Linear-kernel SVM; the training labels are flattened to 1-D before being passed in.
model_svc = SVC(kernel='linear')
train_labels = np.ravel(y_train)
fit_model_accuracy(model_svc, x_train, train_labels, x_test, y_test)