# Impoting required libraries

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from math import log, sqrt
import pandas as pd
import numpy as np
import re
from sklearn.datasets import load_files

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ravi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Loading data set 

In [2]:
tweets = pd.read_csv('sentiment_tweets3.csv')
tweets.head(20)

Unnamed: 0.1,Unnamed: 0,message,label
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0
5,624,so sleepy. good times tonight though,0
6,701,"@SilkCharm re: #nbn as someone already said, d...",0
7,808,23 or 24ï¿½C possible today. Nice,0
8,1193,nite twitterville workout in the am -ciao,0
9,1324,"@daNanner Night, darlin'! Sweet dreams to you",0


In [3]:
tweets.drop(['Unnamed: 0'], axis = 1, inplace = True)

In [4]:
tweets['label'].value_counts()

0    8000
1    2314
Name: label, dtype: int64

In [5]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   message  10314 non-null  object
 1   label    10314 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 161.3+ KB


In [6]:
X, y = tweets.message, tweets.label

In [7]:
X.head()

0    just had a real good moment. i missssssssss hi...
1           is reading manga  http://plurk.com/p/mzp1e
2    @comeagainjen http://twitpic.com/2y2lx - http:...
3    @lapcat Need to send 'em to my accountant tomo...
4        ADD ME ON MYSPACE!!!  myspace.com/LookThunder
Name: message, dtype: object

# Applying stemming

In [8]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [9]:
corpus = []
for i in range(0, len(tweets)):
    review = re.sub('[^a-zA-Z]', ' ', tweets['message'][i])
    review = re.sub(r'\s+[a-zA-Z]\s+', ' ', tweets['message'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [10]:
print(corpus[0:10])

['real good moment. missssssssss much,', 'read manga http://plurk.com/p/mzp1', '@comeagainjen http://twitpic.com/2y2lx - http://www.youtube.com/watch?v=zogfqvh2me8', "@lapcat need send 'em account tomorrow. oddly, even refer taxes. support evidence, though.", 'add myspace!!! myspace.com/lookthund', 'sleepy. good time tonight though', '@silkcharm re: #nbn someon alreadi said, fiber home mean least regular', '23 24ï¿½c possibl today. nice', 'nite twittervil workout -ciao', "@danann night, darlin'! sweet dream"]


In [11]:
len(tweets)

10314

## Vectorizing the dataset using Tf_IDF method

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidfconverter = TfidfVectorizer(max_features=1500, min_df=1, max_df=1.0, stop_words=stopwords.words('english'))
X = tfidfconverter.fit_transform(corpus) 

# splitting the dataset

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Trainig the model using random forest classifier

In [14]:
classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

<IPython.core.display.Javascript object>

RandomForestClassifier(n_estimators=1000, random_state=0)

# Testing the model using test data

In [15]:
y_pred = classifier.predict(X_test)

# generating performance report

In [16]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1572    1]
 [   4  486]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1573
           1       1.00      0.99      0.99       490

    accuracy                           1.00      2063
   macro avg       1.00      1.00      1.00      2063
weighted avg       1.00      1.00      1.00      2063

0.9975763451284537


# The trained mode is stored as a pickle file, so that we dont have to train the model everytime

In [17]:
import pickle
with open('text_classifier', 'wb') as picklefile:
    pickle.dump(classifier,picklefile)

In [25]:

import pickle
with open('tf_idf', 'wb') as picklefile:
    pickle.dump(tfidfconverter,picklefile)

# Testing the model (Dynamic)

In [18]:
input_text="Loving how me and my lovely partner is talking about what we want."

In [19]:

input_text1 = [input_text]
corpus1 = []
for i in range(0, len(input_text1)):
    review = re.sub('[^a-zA-Z]', ' ', input_text1[i])
    review = re.sub(r'\s+[a-zA-Z]\s+', ' ', input_text1[i])
    review = review.lower()
    review = review.split()

    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus1.append(review)
       

M = tfidfconverter.transform(corpus1)



In [47]:
print(corpus1)

['love love partner talk want.']


In [24]:
predicted=classifier.predict(M)
if (predicted[0]==0):
    print('The person is not going through depression')
else:
    print('The person is going through depression')

The person is not going through depression
