## TWITTER SENTIMENT ANALYSIS

In [1]:
import pandas as pd

df=pd.read_csv(r"C:\Users\Pc\OneDrive\Desktop\Deep Learning\NLP\twitter_nlp.csv",encoding="latin1",header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [2]:
df.rename(columns={0:'Sentiment',5:'Text'},inplace=True)

In [3]:
df=df[['Text','Sentiment']]

In [4]:
df.head()

Unnamed: 0,Text,Sentiment
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0
1,is upset that he can't update his Facebook by ...,0
2,@Kenichan I dived many times for the ball. Man...,0
3,my whole body feels itchy and like its on fire,0
4,"@nationwideclass no, it's not behaving at all....",0


In [5]:
df['Sentiment'].value_counts()

Sentiment
0    800000
4    800000
Name: count, dtype: int64

In [6]:
df.isnull().sum()

Text         0
Sentiment    0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(16309)

In [8]:
df=df.drop_duplicates()

In [9]:
df.shape

(1583691, 2)

### TEXT NORMALIZATION

In [10]:
import re

def normalize_text(text):
    text=text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)   
    text = re.sub(r"@\w+", "", text)                     # } PUNCTUATION CLEANING 
    text = re.sub(r"#\w+", "", text)                      
    text = re.sub(r"[^a-zA-Z\s]", "", text)   #NUMBERS HANDLING            
    text = re.sub(r"\s+", " ", text).strip()   #WHITE SPACE REMOVE           
    return text

df["Clean_Text"] = df["Text"].apply(normalize_text)
df.head()

Unnamed: 0,Text,Sentiment,Clean_Text
0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0,a thats a bummer you shoulda got david carr of...
1,is upset that he can't update his Facebook by ...,0,is upset that he cant update his facebook by t...
2,@Kenichan I dived many times for the ball. Man...,0,i dived many times for the ball managed to sav...
3,my whole body feels itchy and like its on fire,0,my whole body feels itchy and like its on fire
4,"@nationwideclass no, it's not behaving at all....",0,no its not behaving at all im mad why am i her...


In [11]:
df.drop('Text',axis=1,inplace=True)

### TOKENIZATION

In [12]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")

df["tokens"] = df["Clean_Text"].apply(word_tokenize)
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Sentiment,Clean_Text,tokens
0,0,a thats a bummer you shoulda got david carr of...,"[a, thats, a, bummer, you, shoulda, got, david..."
1,0,is upset that he cant update his facebook by t...,"[is, upset, that, he, cant, update, his, faceb..."
2,0,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its..."
4,0,no its not behaving at all im mad why am i her...,"[no, its, not, behaving, at, all, im, mad, why..."


### STOPWORD REMOVAL

In [13]:
from nltk.corpus import stopwords
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df["tokens_no_stop"] = df["tokens"].apply(remove_stopwords)
df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Sentiment,Clean_Text,tokens,tokens_no_stop
0,0,a thats a bummer you shoulda got david carr of...,"[a, thats, a, bummer, you, shoulda, got, david...","[thats, bummer, shoulda, got, david, carr, thi..."
1,0,is upset that he cant update his facebook by t...,"[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might..."
2,0,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]"
4,0,no its not behaving at all im mad why am i her...,"[no, its, not, behaving, at, all, im, mad, why...","[behaving, im, mad, cant, see]"


### STEMMING (PORTER STEMMER)

In [14]:
# from nltk.stem import PorterStemmer

# stemmer = PorterStemmer()

# df["stemmed"] = df["tokens_no_stop"].apply(lambda x: [stemmer.stem(word) for word in x])
# df.head()

### LEMMATIZATION 

In [15]:
import spacy
nlp = spacy.load("en_core_web_md", disable=["parser", "ner", "textcat"])

In [16]:
texts = df["tokens_no_stop"].apply(lambda x: " ".join(x)).tolist()

In [17]:
lemmatized = []

for doc in nlp.pipe(texts, batch_size=2000, n_process=4):  
    lemmas = [token.lemma_ for token in doc]
    lemmatized.append(lemmas)

df["lemmatized"] = lemmatized

In [18]:
df["lemmatized"].head()

0    [that, s, bummer, shoulda, get, david, carr, t...
1    [upset, can, not, update, facebook, texting, m...
2    [dive, many, time, ball, manage, save, rest, g...
3               [whole, body, feel, itchy, like, fire]
4                   [behave, I, m, mad, can, not, see]
Name: lemmatized, dtype: object

### FINAL PREPROCESSED TEXT (For ML Models)

In [19]:
df["clean_text"] = df["lemmatized"].apply(lambda x: " ".join(x))

In [20]:
df.head()

Unnamed: 0,Sentiment,Clean_Text,tokens,tokens_no_stop,lemmatized,clean_text
0,0,a thats a bummer you shoulda got david carr of...,"[a, thats, a, bummer, you, shoulda, got, david...","[thats, bummer, shoulda, got, david, carr, thi...","[that, s, bummer, shoulda, get, david, carr, t...",that s bummer shoulda get david carr third day
1,0,is upset that he cant update his facebook by t...,"[is, upset, that, he, cant, update, his, faceb...","[upset, cant, update, facebook, texting, might...","[upset, can, not, update, facebook, texting, m...",upset can not update facebook texting might cr...
2,0,i dived many times for the ball managed to sav...,"[i, dived, many, times, for, the, ball, manage...","[dived, many, times, ball, managed, save, rest...","[dive, many, time, ball, manage, save, rest, g...",dive many time ball manage save rest go bound
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[whole, body, feels, itchy, like, fire]","[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire
4,0,no its not behaving at all im mad why am i her...,"[no, its, not, behaving, at, all, im, mad, why...","[behaving, im, mad, cant, see]","[behave, I, m, mad, can, not, see]",behave I m mad can not see


In [21]:
clean_df=df[['clean_text','Sentiment']]

In [22]:
clean_df.head()

Unnamed: 0,clean_text,Sentiment
0,that s bummer shoulda get david carr third day,0
1,upset can not update facebook texting might cr...,0
2,dive many time ball manage save rest go bound,0
3,whole body feel itchy like fire,0
4,behave I m mad can not see,0


In [23]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1583691 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   clean_text  1583691 non-null  object
 1   Sentiment   1583691 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 36.2+ MB


In [24]:
clean_df.duplicated().sum()

np.int64(104676)

In [25]:
clean_df = clean_df.drop_duplicates(subset="clean_text", keep="first")

In [26]:
clean_df.duplicated().sum()

np.int64(0)

In [27]:
clean_df.shape

(1472167, 2)

In [28]:
clean_df['Sentiment'].value_counts()

Sentiment
0    744754
4    727413
Name: count, dtype: int64

In [29]:
clean_df['Sentiment'] = clean_df['Sentiment'].replace({4: 1})

### MODEL BUILDING

In [30]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(clean_df["clean_text"],clean_df["Sentiment"],test_size=0.2,random_state=42)

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=30000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 60

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post")

In [32]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

model = Sequential([
    Embedding(input_dim=30000, output_dim=128, input_length=max_len,input_shape=(max_len,)),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


  super().__init__(**kwargs)


In [36]:
history = model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1)

Epoch 1/5
[1m16562/16562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m810s[0m 49ms/step - accuracy: 0.5061 - loss: 0.6931 - val_accuracy: 0.5053 - val_loss: 0.6931
Epoch 2/5
[1m16562/16562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m840s[0m 51ms/step - accuracy: 0.5061 - loss: 0.6931 - val_accuracy: 0.5053 - val_loss: 0.6931
Epoch 3/5
[1m16562/16562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m780s[0m 47ms/step - accuracy: 0.5059 - loss: 0.6931 - val_accuracy: 0.5053 - val_loss: 0.6931
Epoch 4/5
[1m16562/16562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m792s[0m 48ms/step - accuracy: 0.5060 - loss: 0.6931 - val_accuracy: 0.5053 - val_loss: 0.6931
Epoch 5/5
[1m16562/16562[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m804s[0m 49ms/step - accuracy: 0.5062 - loss: 0.6931 - val_accuracy: 0.5053 - val_loss: 0.6931


In [37]:
model.evaluate(X_test_pad, y_test)

[1m9202/9202[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 6ms/step - accuracy: 0.5049 - loss: 0.6931


[0.6931437253952026, 0.5048975348472595]

In [38]:
def predict_sentiment(text):
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(pad)[0][0]

    if pred > 0.5:
        return "Positive", pred
    else:
        return "Negative", pred
predict_sentiment("he is good, but some times cruel")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step


('Negative', np.float32(0.49922666))