In [None]:
import numpy as np 
import os 
import pandas as pd 
import re

## Input Files : Raw Negative and Positive Tweets
neg_path, pos_path = (
    "../input/sentimental-analysis-nlp/neg_tweets.txt",
    "../input/sentimental-analysis-nlp/pos_tweets.txt",
)

## Containers for the Cleaned Tweets (Filled by the Reading Loops)
neg_tweets = list()
pos_tweets = list()

## Read Negative Tweets Data
with open(neg_path, "r", encoding = "utf-8") as f :
    neg_lines = f.read().split("\n")

## Line index that is always stored as None, exactly like the former
## `i != 14` check.
## NOTE(review): why this particular line is excluded is not visible here - confirm.
NEG_SKIPPED_LINE = 14

## Clean every line. One entry is appended per input line (None for a
## rejected line) so positions in neg_tweets keep matching line numbers.
for j, line in enumerate(neg_lines) :
    ## Keep letters, digits and ? . , ! @ ; every other character becomes a space
    sent = re.sub(r"[^a-zA-Z1234567890?.,!@]", " ", line).split()
    tweet = None
    ## Lines with fewer than two tokens stay None
    if len(sent) > 1 and j != NEG_SKIPPED_LINE :
        ## Drop tokens that start with "@"
        tweet = " ".join(word for word in sent if not word.startswith("@"))
    neg_tweets.append(tweet)
print(len(neg_tweets))

In [None]:
## Read Positive Tweets Data
with open(pos_path, "r", encoding = "utf-8") as f :
    pos_lines = f.read().split("\n")

## Start from an empty list on every run of this cell. Without this,
## re-running the cell appends the same tweets again and shifts every
## index that later cells rely on.
pos_tweets = []

## Line index that is always stored as None, exactly like the former
## `i != 14` check.
## NOTE(review): why this particular line is excluded is not visible here - confirm.
POS_SKIPPED_LINE = 14

## Clean every line. One entry is appended per input line (None for a
## rejected line) so positions in pos_tweets keep matching line numbers.
for j, line in enumerate(pos_lines) :
    ## Keep letters, digits and ? . , ! @ ; every other character becomes a space
    sent = re.sub(r"[^a-zA-Z1234567890?.,!@]", " ", line).split()
    tweet = None
    ## Lines with fewer than two tokens stay None
    if len(sent) > 1 and j != POS_SKIPPED_LINE :
        ## Drop tokens that start with "@"
        tweet = " ".join(word for word in sent if not word.startswith("@"))
    pos_tweets.append(tweet)

print(len(pos_tweets))

In [None]:
## Importing Stopwords and Lemmatizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

## Define Stopwords and Lemmatizer Variable
stop_words = stopwords.words("english")
lemm = WordNetLemmatizer()

## Build the lookup set once instead of once per word inside the loop
stop_word_set = set(stop_words)

## Indices of Negative Tweets That Are Excluded From Preprocessing
## NOTE(review): presumably these are the entries stored as None by the
## reading cell (re.sub raises TypeError on None) - confirm.
error_num = [14, 104, 421, 1109]


## Do Stopwords and Word Lemmatizer for Negative Tweets
negative_tweets = []
for p, sent in enumerate(neg_tweets) :
    ## Skip the known indices and any other entry that holds no text,
    ## so a None entry outside the list cannot crash re.sub
    if p in error_num or sent is None :
        continue
    ## Keep letters, digits and ? . , ! ; lowercase ; split into tokens
    sent = re.sub(r"[^a-zA-Z1234567890?.,!]", " ", sent)
    sent = sent.lower()
    sent = sent.split()
    ## Remove stopwords, lemmatize the remaining tokens, rebuild the sentence
    new_sent = " ".join(lemm.lemmatize(word) for word in sent if word not in stop_word_set)
    negative_tweets.append(new_sent)
        


In [None]:
## Indices of Positive Tweets That Are Excluded From Preprocessing
## NOTE(review): presumably these are the entries stored as None by the
## reading cell (re.sub raises TypeError on None) - confirm.
error_line = [14, 261, 313, 348, 551]

## Build the lookup set once instead of once per word inside the loop
stop_word_set = set(stop_words)

## Do Stopwords and Word Lemmatizer for Positive Tweets
positive_tweets = []
for l, sentence in enumerate(pos_tweets) :
    ## Skip the known indices and any other entry that holds no text,
    ## so a None entry outside the list cannot crash re.sub
    if l in error_line or sentence is None :
        continue
    ## Keep letters, digits and ? . , ! ; lowercase ; split into tokens
    sent = re.sub(r"[^a-zA-Z1234567890?.,!]", " ", sentence)
    sent = sent.lower()
    sent = sent.split()
    ## Remove stopwords, lemmatize the remaining tokens, rebuild the sentence
    new_sent = " ".join(lemm.lemmatize(word) for word in sent if word not in stop_word_set)
    positive_tweets.append(new_sent)

In [None]:
## Importing Sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

## Import Imblearn to Do Undersampling
from imblearn.under_sampling import NearMiss

## Concat Negative and Positive Tweets
## Create Label (1 for Positive Tweets, 0 for Negative Tweets)
all_tweets = positive_tweets + negative_tweets
Y = np.hstack((np.ones(len(positive_tweets)), np.zeros(len(negative_tweets))))

## Split Data
## The raw text is split BEFORE the vectorizer is fitted, so vocabulary and
## IDF weights are learned from the training tweets only and nothing from
## the test tweets leaks into the features.
tweets_train, tweets_test, Y_train, Y_test = train_test_split(all_tweets, Y,
                                                              random_state = 42,
                                                              test_size = 0.2)

## Define TFIDF, Fit It on the Train Tweets, Transform Both Parts
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(tweets_train)
X_test = vectorizer.transform(tweets_test)

## Matrix for all tweets, kept so code that still reads `X` keeps working
X = vectorizer.transform(all_tweets)

## Do Undersampling for Train Data Only (the test part is left untouched)
nms = NearMiss(n_neighbors = 3)
X_train, Y_train = nms.fit_resample(X_train, Y_train)

In [None]:
## Importing Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

## Define Classifier
## random_state is fixed so that repeated runs build the same forest and
## report the same scores (same seed value as the train/test split).
classifier = RandomForestClassifier(n_estimators = 110, random_state = 42)

## Fitting the Model
classifier.fit(X_train, Y_train)

## Predict Test Data
Y_predict = classifier.predict(X_test)

## Readable names for the numeric class labels
label = {1 : "Positive", 0 : "Negative"}

## Get Accuracy Score and Confusion Matrix
accuracy = accuracy_score(Y_test, Y_predict)
cf_matrix = confusion_matrix(Y_test, Y_predict)

print("Accuracy Score : {}".format(accuracy))
print("Confusion Matrix : {}".format(cf_matrix))

In [None]:
## Importing Pickle
import pickle

## Save Classifier Model and TFIDF
## Each file is opened in a `with` block so the handle is flushed and
## closed as soon as the dump finishes.
## NOTE: only load these pickles from a trusted source - unpickling can
## execute arbitrary code.
tfidf_file = "vectorizer.pickle"
with open(tfidf_file, "wb") as tfidf_out :
    pickle.dump(vectorizer, tfidf_out)

model_file = "randomforest.pickle"
with open(model_file, "wb") as model_out :
    pickle.dump(classifier, model_out)

In [None]:
import seaborn as sn 
import matplotlib.pyplot as plt 
%matplotlib inline

## Plot the Confusion Matrix and Save It as an Image
fig, ax = plt.subplots()

## fmt = "d" prints the cell counts as plain integers; the default format
## switches to scientific notation for counts of 100 or more.
sn.heatmap(cf_matrix, annot = True, fmt = "d", ax = ax)
ax.set_xlabel("Predict")
ax.set_ylabel("True")

## Save before showing so the written file contains the drawn figure
fig.savefig("confusion.png")
plt.show()