In [142]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score

In [143]:
# Fetch the NLTK resources used below: WordNet (lemmatizer), the stop-word
# lists, and the Punkt tokenizer models. Recent NLTK releases (3.9+) load Punkt
# from the 'punkt_tab' package while older ones use 'punkt', so request both;
# without it word_tokenize raises LookupError on a fresh environment.
for resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\potsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\potsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\potsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [144]:
# Load the labelled tweets; pandas assigns a default integer index.
df = pd.read_csv("sentiment_tweets3.csv")

In [145]:
# Preview the first rows of the raw file to see its columns.
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [146]:
# Discard the 'Index' column from the source file. errors='ignore' keeps this
# cell safe to re-run: without it a second execution raises KeyError because
# the column has already been removed from df.
df = df.drop(columns='Index', errors='ignore')

In [147]:
# Confirm that the 'Index' column is gone.
df.head()

Unnamed: 0,message to examine,label (depression result)
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [148]:
def remove_urls(text):
    """Return `text` with every URL deleted.

    A URL here is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Bare domains without one of those
    prefixes are left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)

In [149]:
def remove_htmls(text):
    """Return `text` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed separately, and it does
    not cross line breaks because ``.`` does not match a newline.
    """
    return re.sub('<.*?>', r'', text)

In [150]:
# Shared lemmatizer instance; clean_text() uses it to lemmatize each token.
wlm=WordNetLemmatizer()

In [151]:
#text preprocessing

In [152]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(text):
    """Normalise a raw message into a space-joined string of lemmatized tokens.

    Steps, in order: lower-case the text, strip URLs and HTML tags, tokenize
    with NLTK, keep only purely alphanumeric tokens, drop English stop words,
    and lemmatize what remains with the shared `wlm` lemmatizer.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    text = remove_htmls(remove_urls(text.lower()))

    # Look the stop words up in a cached set: calling stopwords.words() per
    # token re-reads the word list and scans it linearly every time.
    stop_words = _english_stopwords()

    # No separate punctuation test is needed: a token that passes isalnum()
    # contains no punctuation characters at all.
    tokens = [
        token
        for token in nltk.word_tokenize(text)
        if token.isalnum() and token not in stop_words
    ]
    return " ".join(wlm.lemmatize(token) for token in tokens)
    

In [153]:
# Inspect the raw message column before renaming and cleaning it.
df['message to examine']

0        just had a real good moment. i missssssssss hi...
1               is reading manga  http://plurk.com/p/mzp1e
2        @comeagainjen http://twitpic.com/2y2lx - http:...
3        @lapcat Need to send 'em to my accountant tomo...
4            ADD ME ON MYSPACE!!!  myspace.com/LookThunder
                               ...                        
10309    No Depression by G Herbo is my mood from now o...
10310    What do you do when depression succumbs the br...
10311    Ketamine Nasal Spray Shows Promise Against Dep...
10312    dont mistake a bad day with depression! everyo...
10313                                                    0
Name: message to examine, Length: 10314, dtype: object

In [154]:
# Shorten the two column names used for the rest of the notebook.
df = df.rename(
    columns={
        "message to examine": "message",
        "label (depression result)": "label",
    }
)

In [155]:
# Check the renamed columns.
df.head()

Unnamed: 0,message,label
0,just had a real good moment. i missssssssss hi...,0
1,is reading manga http://plurk.com/p/mzp1e,0
2,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,@lapcat Need to send 'em to my accountant tomo...,0
4,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [156]:
# Replace each raw message with its cleaned form; this overwrites the
# original text in df, so reload the CSV to get the raw messages back.
df['message']=df['message'].apply(clean_text)

In [157]:
# Preview the cleaned messages.
df.head()

Unnamed: 0,message,label
0,real good moment miss much,0
1,reading manga,0
2,comeagainjen,0
3,lapcat need send accountant tomorrow oddly eve...,0
4,add myspace,0


In [158]:
# TF-IDF vectorizer with default settings; reused later to transform new text.
tfidf=TfidfVectorizer()

In [159]:
# Fit the vectorizer on the cleaned messages and vectorize them in one step.
# NOTE(review): this fits on the whole dataset before the train/test split
# below, so test-set text contributes to the vocabulary and IDF weights;
# consider fitting on the training portion only.
X_tfidf=tfidf.fit_transform(df['message'])

In [160]:
# Target labels as a NumPy array, aligned row-for-row with X_tfidf.
Y=df['label'].values

In [187]:
# Inspect the label array.
Y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [161]:
# Hold out 20% of the rows for evaluation, keeping the class ratio of Y in
# both parts (stratify). random_state pins the shuffle so that the split, and
# every accuracy figure computed from it, is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [162]:
# First model: multinomial Naive Bayes with default settings.
classifier1=MultinomialNB()

In [163]:
# Train on the training split.
classifier1.fit(X_train, y_train)

In [164]:
# Predict labels for the held-out test rows.
y_pred1=classifier1.predict(X_test)

In [165]:
# Test-set accuracy of the MultinomialNB model (classifier1).
print("Accuracy:",accuracy_score(y_test,y_pred1))

Accuracy: 0.9001454192922927


In [166]:
# Second model: Bernoulli Naive Bayes with default settings.
classifier2=BernoulliNB()

In [167]:
# Train on the same training split as classifier1.
classifier2.fit(X_train, y_train)

In [168]:
# Predict labels for the held-out test rows.
y_pred2= classifier2.predict(X_test)

In [169]:
# Test-set accuracy of the BernoulliNB model (classifier2).
print("Accuracy:", accuracy_score(y_test,y_pred2))

Accuracy: 0.9660688317983519


In [170]:
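# BernoulliNB scores higher than MultinomialNB on this split. Note that
# BernoulliNB is not specific to binary classification: it models binary
# (present/absent) features, binarizing the TF-IDF values at its default
# threshold of 0.0, which likely suits these short tweets better than the
# weighted counts MultinomialNB uses.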

In [171]:
# Classify a new, user-supplied text with the fitted vectorizer and the BernoulliNB model

In [195]:
# Read the text to classify from the user.
# NOTE(review): input() blocks and waits for typing, so "Run All" stops here.
input_text=input("Enter your text")


Enter your text @TheBloggess If you click on this, my Twit handle, you'll find my FB in my bio. Partly introvert, dash of extrovert in the right arena. EMAPTH. I love all living things, working on enlightenment, I'm weird so I stay home a lot. Fought depression for 30+yr


In [196]:
# Clean the user's text the same way the training messages were cleaned, and echo it.
input_text1 = clean_text(input_text)
print(input_text1)

# Vectorize with the already-fitted TF-IDF vectorizer, then take the single prediction.
tfidf_input = tfidf.transform([input_text1])
result = classifier2.predict(tfidf_input)[0]


thebloggess click twit handle find fb bio partly introvert dash extrovert right arena emapth love living thing working enlightenment weird stay home lot fought depression


In [197]:
# Map the predicted class to its human-readable name; any other value prints nothing.
label_names = {0: "No Depression", 1: "Depression"}
if result in label_names:
    print(label_names[result])

Depression
