In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import string
from nltk.corpus import stopwords
import re
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer, PorterStemmer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
df = pd.read_csv("C:/Users/rowan/Downloads/sentiment_tweets.csv")
df.head()

Unnamed: 0,Index,message to examine,label (depression result)
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10314 entries, 0 to 10313
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Index                      10314 non-null  int64 
 1   message to examine         10314 non-null  object
 2   label (depression result)  10314 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 241.9+ KB


In [27]:
df = df.rename(columns = {"message to examine" : "Text", "label (depression result)": "label"})

In [28]:
df.head()

Unnamed: 0,Index,Text,label
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


In [29]:
df = df.drop_duplicates()

In [30]:
y = df.label

In [31]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: label, dtype: int64

In [32]:
y.value_counts()

0    8000
1    2314
Name: label, dtype: int64

In [33]:
x = df.Text
x.head()

0    just had a real good moment. i missssssssss hi...
1           is reading manga  http://plurk.com/p/mzp1e
2    @comeagainjen http://twitpic.com/2y2lx - http:...
3    @lapcat Need to send 'em to my accountant tomo...
4        ADD ME ON MYSPACE!!!  myspace.com/LookThunder
Name: Text, dtype: object

In [34]:
# function to convert the text to lower case
def convert_to_lowercase(text):
    return text.str.lower()

# function to remove punctuations from the text
def remove_punctuations(text):
    eng_punctuation = string.punctuation
    translator = str.maketrans('','', eng_punctuation)
    return str(text).translate(translator)

# function to remove stopwords from the text
def remove_stopwords(text):
    from nltk.corpus import stopwords
    stopwords = set(stopwords.words('english'))
    return " ".join([word for word in str(text).split() if word not in stopwords])

# function to remove repeating characters
def remove_repeating_characters(text):
    return re.sub(r'(.)1+', r'1', text)

# function to remove numeric text
def remove_numeric(text):
    return re.sub('[0-9]+', '', text)

# Tokenizing the text
def tokenize_text(text):
    tokenizer = RegexpTokenizer('\w+')
    text = text.apply(tokenizer.tokenize)
    return text

# lemmatizing the text. Converting some of the words to their root form. 
def text_lematization(text):
    lm = WordNetLemmatizer()
    text = [lm.lemmatize(word) for word in text]
    return text 

In [35]:
def preprocess(text):
    text = convert_to_lowercase(text)
    text = text.apply(lambda x : remove_punctuations(x))
    text = text.apply(lambda x : remove_stopwords(x))
    text = text.apply(lambda x : remove_repeating_characters(x))
    text = text.apply(lambda x : remove_numeric(x))
    text = tokenize_text(text)
    text = text.apply(lambda x : text_lematization(x))
    text = text.apply(lambda x: " ".join(x))
    return text

In [36]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state = 42)

In [37]:
x_train = preprocess(x_train)

In [38]:
x_test = preprocess(x_test)

In [39]:
vectorizer = TfidfVectorizer()
vectorizer.fit(x_train)

In [40]:
x_train = vectorizer.transform(x_train)

In [41]:
x_test = vectorizer.transform(x_test)

In [42]:
#model
lr_model = LogisticRegression()
lr_model.fit(x_train, y_train)
lr_pred = lr_model.predict(x_test)

In [43]:
print(classification_report(lr_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      2465
           1       0.94      1.00      0.97       630

    accuracy                           0.99      3095
   macro avg       0.97      0.99      0.98      3095
weighted avg       0.99      0.99      0.99      3095



In [44]:
train_lr_pred = lr_model.predict(x_train)
print(classification_report(train_lr_pred, y_train))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      5692
           1       0.93      1.00      0.96      1527

    accuracy                           0.98      7219
   macro avg       0.96      0.99      0.98      7219
weighted avg       0.99      0.98      0.98      7219

