In [None]:
#importing packages 
import numpy as np
import pandas as pd
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
import string 
import re

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# read further down (under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#downloading stopwords
# Fetch the NLTK stopword corpus; process_tweet() reads it through
# stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
#loading dataset
# Folder that holds train.csv / test.csv. Kept in one constant so the location
# only has to be edited once when the notebook runs outside this Drive layout.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/tweet sentiment analysis'

train_data = pd.read_csv(DATA_DIR + '/train.csv')
test_data = pd.read_csv(DATA_DIR + '/test.csv')

In [None]:
# The training sample pairs each tweet with a label:
#   label 1 -> the tweet is racist/sexist
#   label 0 -> the tweet is not racist/sexist
# Display the frame (pandas truncates the middle rows when rendering).
train_data

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [None]:
#process tweets
def process_tweet(tweet):
    """Turn one raw tweet into a list of cleaned, stemmed tokens.

    Steps, in order:
      1. strip ``$``-prefixed tokens, a leading ``RT`` marker, hyperlinks and
         the ``#`` sign (the hashtag word itself is kept);
      2. tokenize with ``TweetTokenizer`` (lower-cased, @handles stripped,
         repeated characters shortened);
      3. drop English stopwords and punctuation tokens;
      4. stem what is left with the Porter stemmer.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Stemmed tokens in their original order, e.g.
        ``['father', 'dysfunct', 'selfish', ...]``.
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests in the filter loop below.
    stopwords_english = set(stopwords.words('english'))

    # Drop "$"-prefixed tokens such as stock tickers ("$GE").
    tweet = re.sub(r'\$\w*', '', tweet)
    # Drop the old-style retweet marker at the very start of the text.
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # Drop hyperlinks only. The previous pattern ("https?://.*") was greedy and
    # deleted every word that followed a link up to the end of the line.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove just the hash sign so the hashtag word survives as a token.
    tweet = re.sub(r'#', '', tweet)

    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    tweets_clean = []
    for word in tweet_tokens:
        # `in string.punctuation` is a substring test, so single punctuation
        # characters are rejected here while multi-character emoticons such as
        # ":)" pass through unless they happen to be a substring of it.
        if word not in stopwords_english and word not in string.punctuation:
            tweets_clean.append(stemmer.stem(word))

    return tweets_clean

In [None]:
# building frequences for tweets 
def build_freq(tweets,ys):
    """Count how often each processed word occurs under each label.

    Parameters
    ----------
    tweets : iterable of str
        Raw tweet texts.
    ys : array-like
        Labels aligned with ``tweets``; a list, an (m,) array or an (m, 1)
        array are all accepted.

    Returns
    -------
    dict
        Maps ``(word, label)`` to the number of times ``word`` (as returned by
        ``process_tweet``) appears in tweets carrying ``label``.
    """
    # squeeze() alone turns a single label into a 0-d array whose tolist() is a
    # bare scalar, which zip() cannot iterate; atleast_1d keeps it a list.
    yslist = np.atleast_1d(np.squeeze(ys)).tolist()

    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1
    return freqs
    

In [None]:
# Plain Python lists taken from the 'tweet' and 'label' columns; both keep the
# DataFrame row order, so train_x[i] pairs with train_y[i].
train_x=train_data['tweet'].tolist()
train_y=train_data['label'].tolist()

In [None]:
# (stemmed word, label) -> occurrence count, computed over every training tweet
freqs=build_freq(train_x,train_y)

In [None]:
# extract features from processed tweets 
def extract_features(tweet,freqs):
    """Build the 3-value feature row for a single tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text; it is cleaned with ``process_tweet`` first.
    freqs : dict
        ``(word, label) -> count`` table as produced by ``build_freq``.

    Returns
    -------
    numpy.ndarray of shape (1, 3)
        ``[bias, label-0 count total, label-1 count total]`` where the totals
        add up the table entries of every token in the tweet (missing entries
        count as 0).
    """
    tokens = process_tweet(tweet)

    # (word, 0.0) finds entries stored under the integer label 0 as well,
    # because equal numbers hash alike in Python.
    non_racist_total = sum(freqs.get((token, 0.0), 0) for token in tokens)
    racist_total = sum(freqs.get((token, 1.0), 0) for token in tokens)

    # leading 1 is the bias term
    return np.array([[1, non_racist_total, racist_total]], dtype=float)
#  row layout: [1, non_racist_freq, racist_freq]

In [None]:
# Assemble the design matrix (one 3-feature row per tweet) and the label vector
X = np.zeros((len(train_x), 3))

for row_idx, raw_text in enumerate(train_x):
    # extract_features returns shape (1, 3); it broadcasts into the row slot
    X[row_idx, :] = extract_features(raw_text, freqs)

y = np.array(train_y)


In [None]:
#accuracy
from sklearn.metrics import accuracy_score
def acc_score(y_true,y_pred):
  """Return sklearn's accuracy_score for the given true and predicted labels.

  Thin wrapper kept so every model cell reports its score the same way.
  """
  return accuracy_score(y_true, y_pred)

In [None]:
#logistic regression classifier model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# fit() returns the estimator itself, so construction and training can be chained
lr_model = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred = lr_model.predict(X_test)
lr_acc = acc_score(y_test, y_pred)
print("Lr accuracy:", lr_acc)

Lr accuracy: 0.9371187236039418


In [None]:
#support vector machine model
from sklearn.svm import SVC

# Sigmoid-kernel SVM trained on the same split as the logistic regression
svm_model = SVC(kernel='sigmoid', random_state=0).fit(X_train, y_train)

y_pred = svm_model.predict(X_test)
svm_acc = acc_score(y_test, y_pred)
print("SVM accuracy:", svm_acc)

SVM accuracy: 0.8831534490849366


In [None]:
#KNN classifier model
from sklearn.neighbors import KNeighborsClassifier

# 5 nearest neighbours; Minkowski distance with p=2 is the Euclidean distance
knn_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2).fit(X_train, y_train)

y_pred = knn_model.predict(X_test)
knn_acc = acc_score(y_test, y_pred)
print("KNN accuracy:", knn_acc)

KNN accuracy: 0.9493195682778038


In [None]:
# Classify one hand-picked tweet with the fitted SVM.
# To try another example, assign a different string (or e.g. train_x[31960]) to my_tweet.
my_tweet =  '@user #sikh #temple vandalised in in #calgary, #wso condemns  act'        #racist

# flatten the (1, 3) feature row to (3,), then wrap it again as a one-sample batch
x = np.squeeze(np.asarray(extract_features(my_tweet, freqs)))
prediction = svm_model.predict([x])

verdict = 'tweet predicted as :racist tweet' if prediction == 1 else 'tweet predicted as : non racist'
print(verdict)

tweet predicted as :racist tweet


## LSTM RNN: recurrent (LSTM) classifier trained on the same tweet data

In [None]:
#importing packages
import numpy as np
import pandas as pd
import  string 
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM


In [None]:
#loading dataset
# Read the training CSV again into its own frame (df) for the LSTM pipeline;
# the columns added below go onto df, not onto train_data.
df=pd.read_csv('/content/drive/MyDrive/Colab Notebooks/tweet sentiment analysis/train.csv')
df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [None]:
#creating vocabulary
# Distinct stemmed tokens in first-seen order. dict.fromkeys de-duplicates with
# a hash lookup per token and preserves insertion order, so the list matches
# what a "not in vocab" scan would build, without rescanning the growing list
# for every word (quadratic on a corpus this size).
vocab = list(dict.fromkeys(
    word
    for tweet in df.tweet
    for word in process_tweet(tweet)
))

  

In [None]:
# Number of distinct stemmed tokens collected above
print("length of vocab:",len(vocab))

length of vocab: 36805


In [None]:
#process tweet
processed_tweet=[]
for t in df.tweet:
  p_t=process_tweet(t)
  processed_tweet.append(p_t)

In [None]:
# Spot-check: cleaned tokens of the first tweet
processed_tweet[0]

['father', 'dysfunct', 'selfish', 'drag', 'kid', 'dysfunct', 'run']

In [None]:
# Attach the token lists as a new column (this mutates df in place), then preview
df['p_tweet']=processed_tweet
df.head()

Unnamed: 0,id,label,tweet,p_tweet
0,1,0,@user when a father is dysfunctional and is s...,"[father, dysfunct, selfish, drag, kid, dysfunc..."
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thank, lyft, credit, can't, use, caus, offer,..."
2,3,0,bihday your majesty,"[bihday, majesti]"
3,4,0,#model i love u take with u all the time in ...,"[model, love, u, take, u, time, urð, , , ±, ..."
4,5,0,factsguide: society now #motivation,"[factsguid, societi, motiv]"


In [None]:
#transform label
# One-hot encode the target: label 0 -> [1, 0], any other label -> [0, 1]
label_transform = [[1, 0] if raw_label == 0 else [0, 1] for raw_label in df.label]


In [None]:
# Store the one-hot targets next to the raw labels (this mutates df in place), then preview
df['label_tran']=label_transform
df.head()

Unnamed: 0,id,label,tweet,p_tweet,label_tran
0,1,0,@user when a father is dysfunctional and is s...,"[father, dysfunct, selfish, drag, kid, dysfunc...","[1, 0]"
1,2,0,@user @user thanks for #lyft credit i can't us...,"[thank, lyft, credit, can't, use, caus, offer,...","[1, 0]"
2,3,0,bihday your majesty,"[bihday, majesti]","[1, 0]"
3,4,0,#model i love u take with u all the time in ...,"[model, love, u, take, u, time, urð, , , ±, ...","[1, 0]"
4,5,0,factsguide: society now #motivation,"[factsguid, societi, motiv]","[1, 0]"


In [None]:
# Map every token to an integer id and pad all tweets to one common length.
# num_words is tied to the vocabulary size rather than the literal 36805, so it
# follows the data when the corpus or the cleaning rules change and stays in
# step with the Embedding layer, which also sizes itself from len(vocab).
# NOTE(review): Keras keeps only ids below num_words, so the single rarest
# token is dropped with this setting — confirm that is acceptable.
tokenizer = Tokenizer(num_words=len(vocab), split=' ')
tokenizer.fit_on_texts(df['p_tweet'].values)

# X and y are rebound here; the 3-feature matrix from the classic models is gone after this cell
X = tokenizer.texts_to_sequences(df['p_tweet'])
X = pad_sequences(X)  # no maxlen given: pads to the longest tweet
y = np.array(df.label_tran.tolist())

In [None]:
embeding_dim=128   # size of each learned word vector
lstm_out=196       # number of LSTM units

# Embedding -> LSTM -> 2-way softmax (one output per one-hot label column)
model = Sequential()
# input_length follows the padded width of X instead of the literal 28, so the
# model still matches the data if the longest tweet changes.
model.add(Embedding(input_dim=len(vocab), output_dim=embeding_dim, input_length=X.shape[1]))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))




In [None]:
# Adam optimizer, accuracy reported during fit/evaluate.
# NOTE(review): the targets are one-hot pairs and the output layer is a 2-unit
# softmax; 'categorical_crossentropy' is the conventional loss for that setup —
# confirm 'binary_crossentropy' is intended.
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])

In [None]:
# 85/15 split of the padded sequences and one-hot labels; fixed seed for repeatability.
# This rebinds X_train/X_test/y_train/y_test that the classic models used earlier.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.15, random_state = 42)

In [None]:
# Train for a single epoch in batches of 32.
# NOTE(review): the hold-out split passed as validation_data here is the same
# one given to model.evaluate() below, so it serves as both validation and test set.
model.fit(X_train, y_train,validation_data = (X_test,y_test),epochs = 1, batch_size=32)



<keras.callbacks.History at 0x7f39592d9850>

In [None]:
# Score on the hold-out split; with the compile() settings above the result is [loss, accuracy]
model.evaluate(X_test,y_test)



[0.11375043541193008, 0.9618352651596069]

In [None]:
# Persist the trained model to Drive as a single .h5 file
model.save('/content/drive/MyDrive/Colab Notebooks/tweet sentiment analysis/Tweet_analysis_lstm.h5')

In [None]:
# Predict the class probabilities of one tweet with the LSTM.
tweet=train_data.tweet[0]
p_tweet=process_tweet(tweet)

# texts_to_sequences expects a list of texts. Passing the token list directly
# made every token its own one-word "text", so the model scored each word
# separately. Wrapping it in a list yields a single sequence for the whole tweet.
x=tokenizer.texts_to_sequences([p_tweet])
# Pad to the width the model was trained on (the padded width of X).
x=pad_sequences(x, maxlen=X.shape[1])

pred=model.predict(x)   # shape (1, 2): one softmax probability per class

In [None]:
# Predicted class ids for the first five hold-out tweets.
# Sequential.predict_classes() no longer exists in current Keras/TensorFlow;
# argmax over the softmax output gives the same ids and works on old versions too.
np.argmax(model.predict(X_test[0:5]), axis=-1)




array([0, 0, 0, 0, 0])

In [None]:
# True class ids for the same five hold-out tweets. y_test holds one-hot rows
# ([1, 0] / [0, 1]), so decode them with argmax to compare against the predicted ids.
np.argmax(y_test[:5], axis=1)

array([0, 0, 0, 0, 0])