In [1]:
import nltk

# Fetch only the resources this notebook relies on (tokenizer, stop words,
# WordNet for lemmatizing, POS tagger). A bare nltk.download() opens the
# interactive downloader, which stalls an unattended "Restart & Run All".
# Both the classic and the newer "_tab" / "_eng" resource ids are requested so
# the cell works across NLTK releases -- TODO trim to the ids your NLTK needs.
for resource_id in (
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
):
    nltk.download(resource_id, quiet=True)

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk.stem
from nltk.stem import LancasterStemmer
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd
import shutil
import warnings
warnings.filterwarnings(action="ignore")
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer

## Reading Dataset

In [3]:
# The Sentiment140 training CSV has no header row; without header=None pandas
# would consume the first tweet as column names and silently drop it.
# Columns are therefore numbered 0..5 here and named in the following cell.
data= pd.read_csv("../input/sentiment140/training.1600000.processed.noemoticon.csv",encoding="Latin",header=None)

In [4]:
# Give the six columns readable names.
data.columns = [
    "class",
    "ID",
    "Day",
    "no_Query",
    "username",
    "text",
]

In [5]:
# Only the label and the tweet text are used from here on.
data = data.loc[:, ["class", "text"]]

In [6]:
## Selecting only 10000 rows out of the total, keeping the class proportions
SAMPLE_SIZE = 10000
SEED = 42  # fixed seed so the same subset is drawn on every run

# Draw the same fraction from every class (stratified sample), then shuffle
# the rows so the classes are interleaved.
data = (
    data.groupby('class', group_keys=False)
        .sample(frac=SAMPLE_SIZE / len(data), random_state=SEED)
        .sample(frac=1, random_state=SEED)
        .reset_index(drop=True)
)

In [7]:
# Rows per class in the sampled frame (labels have not been remapped yet).
data['class'].value_counts()

In [8]:
# Recode the positive label 4 as 1; 0 stays 0.
label_map = {4:1,0:0}
data["class"] = data["class"].replace(label_map)

In [9]:
# Same count after the remap in the previous cell (4 -> 1, 0 -> 0).
data['class'].value_counts()

## Data preparation for Naive Bayes

In [10]:
## Splitting the frame into one dataframe per class (0 = negative, 1 = positive)
negative_tweet = data[data["class"].eq(0)]
positive_tweet = data[data["class"].eq(1)]

In [11]:
# Plain Python lists of the tweet text, one list per class.
negative_tweets = negative_tweet["text"].to_list()
positive_tweets = positive_tweet["text"].to_list()

In [12]:
# Peek at one raw negative tweet before any cleaning.
negative_tweets[0]

In [13]:
def remove_noise(tweet_tokens, stop_words = ()):
    """Clean one tokenized tweet and return its normalised tokens.

    For every token the function strips URL text and a leading ``@handle``,
    lemmatizes the remainder using its part-of-speech tag, and keeps it
    (lower-cased) unless it is empty, punctuation, or a stop word.

    Parameters
    ----------
    tweet_tokens : list of str
        Tokens of a single tweet, passed to ``pos_tag``.
    stop_words : iterable of str, optional
        Lower-case words to drop. Defaults to an empty tuple (nothing dropped).

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    # Raw strings keep the backslashes literal without invalid-escape warnings.
    # The URL pattern is intentionally not anchored with "^@": with that prefix
    # it could only match text starting with "@http", i.e. never a real URL.
    url_pattern = re.compile(
        r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
        r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )
    mention_pattern = re.compile(r"(^@[A-Za-z0-9_]+)")

    # Built once per call instead of once per token.
    lemmatizer = WordNetLemmatizer()
    stop_word_set = set(stop_words)

    cleaned_tokens = []

    for token, tag in pos_tag(tweet_tokens):
        token = url_pattern.sub('', token)
        token = mention_pattern.sub('', token)

        # Map the tagger's tag onto the lemmatizer's POS codes:
        # nouns -> 'n', verbs -> 'v', everything else treated as adjective.
        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_word_set:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens

In [14]:
from nltk.corpus import stopwords

# English stop-word list handed to remove_noise when cleaning the tweets.
stop_words = stopwords.words('english')
print(stop_words)
print('\n')

# Quick check of the cleaning function on the first positive tweet.
sample_tokens = word_tokenize(positive_tweets[0])
print(remove_noise(sample_tokens, stop_words))

In [15]:
## Tokenizing every tweet, one token list per tweet
positive_tweet_tokens = [word_tokenize(tweet) for tweet in positive_tweets]
negative_tweet_tokens = [word_tokenize(tweet) for tweet in negative_tweets]

In [16]:
# Tokens produced by word_tokenize for the first positive tweet.
positive_tweet_tokens[0]

In [17]:
## Cleaning each token list with remove_noise (drops stop words and punctuation)
positive_cleaned_tokens_list = [
    remove_noise(tweet_tokens, stop_words) for tweet_tokens in positive_tweet_tokens
]

negative_cleaned_tokens_list = [
    remove_noise(tweet_tokens, stop_words) for tweet_tokens in negative_tweet_tokens
]

In [18]:
# First positive tweet after remove_noise, for comparison with its raw tokens.
positive_cleaned_tokens_list[0]

In [19]:
## Converting token lists into the feature-dict format used for Naive Bayes
def get_tweets_for_model(cleaned_tokens_list):
    """Lazily yield one ``{token: True}`` feature dict per tweet.

    ``cleaned_tokens_list`` is an iterable of token lists; repeated tokens in
    a tweet collapse into a single key.
    """
    for tweet_tokens in cleaned_tokens_list:
        yield dict.fromkeys(tweet_tokens, True)

# Materialise the feature dicts as lists. get_tweets_for_model returns a
# generator, which is exhausted after a single pass; keeping lists means the
# cell that builds the datasets can be re-run without silently getting
# empty data.
positive_tokens_for_model = list(get_tweets_for_model(positive_cleaned_tokens_list))
negative_tokens_for_model = list(get_tweets_for_model(negative_cleaned_tokens_list))

In [20]:
import random

# Attach the class label to every feature dict.
positive_dataset = [(tweet_dict, "Positive")
                     for tweet_dict in positive_tokens_for_model]

negative_dataset = [(tweet_dict, "Negative")
                     for tweet_dict in negative_tokens_for_model]
dataset = positive_dataset + negative_dataset

# Seeded, private RNG: the shuffle (and therefore the split) is reproducible
# and the global random state is left untouched.
random.Random(42).shuffle(dataset)

# Disjoint split: the first TRAIN_SIZE rows train the model, the rest test it.
# (Slicing the test set from index 3000 would overlap the training rows and
# inflate the reported test accuracy.)
TRAIN_SIZE = 7000
train_data = dataset[:TRAIN_SIZE]
test_data = dataset[TRAIN_SIZE:]

In [21]:
from nltk import classify
from nltk import NaiveBayesClassifier

# Fit NLTK's Naive Bayes classifier on the (feature dict, label) pairs.
classifier = NaiveBayesClassifier.train(train_data)

print("Training Accuracy is:", classify.accuracy(classifier, train_data))
print("Testing Accuracy is:", classify.accuracy(classifier, test_data))


# show_most_informative_features() prints its table itself and returns None,
# so it is called directly; wrapping it in print() adds a stray "None" line.
classifier.show_most_informative_features(10)

In [22]:
from nltk.tokenize import word_tokenize

custom_tweet = "@defenestr8 107 degrees dry heat? I'll take it eff this goddamn 102 degree heat index with 60%+ humidity"
# Clean the tweet exactly like the training tweets, stop-word removal
# included, so its features match what the classifier was trained on.
custom_tokens = remove_noise(word_tokenize(custom_tweet), stop_words)

print(classifier.classify(dict([token, True] for token in custom_tokens)))

## Data preparation for any Machine Learning Algorithm

In [23]:
## remove stopwords and punctuation marks
# Kept as a set: the per-token "not in" filter in the next cell then costs
# O(1) per lookup instead of scanning a list.
stuff_to_be_removed = set(stopwords.words('english')) | set(punctuation)
stemmer = LancasterStemmer()
## corpus is the raw tweet text of every sampled row
corpus = data['text'].tolist()
print(len(corpus))
print(corpus[1])

In [24]:
# Lower-case and tokenize each tweet, drop stop words / punctuation, stem what
# is left and glue the stems back into one space-separated string.
final_corpus = []
for raw_text in corpus:
    tokens = word_tokenize(raw_text.lower())
    stems = [stemmer.stem(tok) for tok in tokens if tok not in stuff_to_be_removed]
    final_corpus.append(" ".join(stems))
final_corpus[1]

In [25]:
# Pair every preprocessed tweet with its label in a fresh two-column frame.
classes = data['class'].tolist()
newdf = pd.DataFrame({'text': final_corpus, 'class': classes})
newdf.head()

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [28]:
# Target labels.
y = newdf['class']

# Fit TF-IDF on the preprocessed text and encode every document.
tfidf = TfidfVectorizer()
text_column = newdf['text']
vector = tfidf.fit_transform(text_column)
print(type(vector))

# Convert the vectorizer output to a dense array for the models below.
X = vector.toarray()
print(X)

In [29]:
## Splitting the data: 67% train / 33% test, stratified on the label
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [30]:
def metrics(y_train,y_train_pred,y_test,y_test_pred):
    """Report accuracy, confusion matrix and classification report per split.

    The training split is reported first, then the testing split. For each one
    the accuracy is printed as a percentage, a confusion-matrix figure is
    built, the classification report is printed, and the figure is shown.
    Nothing is returned.
    """
    reports = (
        ("training accuracy = ", y_train, y_train_pred),
        ("testing accuracy = ", y_test, y_test_pred),
    )
    for heading, y_true, y_pred in reports:
        print(heading, accuracy_score(y_true, y_pred)*100)
        ConfusionMatrixDisplay.from_predictions(y_true, y_pred)
        print(classification_report(y_true, y_pred))
        plt.show()

In [31]:
# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator.
MNB = MultinomialNB().fit(X_train, y_train)
y_train_pred, y_test_pred = MNB.predict(X_train), MNB.predict(X_test)

In [32]:
# Evaluate the predictions currently held in y_train_pred / y_test_pred.
metrics(
    y_train=y_train,
    y_train_pred=y_train_pred,
    y_test=y_test,
    y_test_pred=y_test_pred,
)

In [33]:
# Linear support-vector classifier on the same features; fit() returns the estimator.
svc = LinearSVC().fit(X_train, y_train)
y_train_pred, y_test_pred = svc.predict(X_train), svc.predict(X_test)

In [34]:
# Evaluate the predictions currently held in y_train_pred / y_test_pred.
metrics(
    y_train=y_train,
    y_train_pred=y_train_pred,
    y_test=y_test,
    y_test_pred=y_test_pred,
)

In [39]:
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's bootstrap / feature sampling so the reported
# scores are the same on every run.
RFC = RandomForestClassifier(n_estimators=100,max_depth=10,min_samples_split=3,random_state=42)
RFC.fit(X_train,y_train)
y_train_pred = RFC.predict(X_train)
y_test_pred = RFC.predict(X_test)

In [40]:
# Evaluate the predictions currently held in y_train_pred / y_test_pred.
metrics(
    y_train=y_train,
    y_train_pred=y_train_pred,
    y_test=y_test,
    y_test_pred=y_test_pred,
)