In [None]:

import numpy as np 
import pandas as pd 


import os
# Walk the Kaggle input directory tree and print the full path of every file
# found, so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


### **Read dataset**

In [None]:
%%time
# Load the train and test CSV files of the tweet dataset into DataFrames.
# Later cells read the text from the 'tweet' column of both frames.
train = pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/train.csv')
test = pd.read_csv('/kaggle/input/twitter-sentiment-analysis-hatred-speech/test.csv')

### **Preview dataset**

In [None]:
train.head()

In [None]:
test.head()

In [None]:
def num_of_words(df):
    """Add a ``word_count`` column with the number of words in each tweet.

    Words are whitespace-separated tokens. ``str.split()`` is called without
    an argument so that leading/trailing whitespace and runs of several
    spaces do not produce empty "words" (``split(" ")`` counts each of those
    as a word). This also matches the tokenisation used by ``avg_word``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column. It is modified in place; values are
        passed through ``str()`` first, so non-string cells do not raise.

    Returns
    -------
    None
        The first rows of ``tweet`` and ``word_count`` are printed as a preview.
    """
    df['word_count'] = df['tweet'].apply(lambda text: len(str(text).split()))
    print(df[['tweet', 'word_count']].head())

In [None]:
num_of_words(train)

In [None]:
num_of_words(test)

- The word count of every tweet has been calculated above.

In [None]:
def num_of_chars(df):
    """Add a ``char_count`` column with the length of each tweet.

    The length is the raw string length, so spaces are counted as characters.
    ``df`` is modified in place and the first rows of ``tweet`` and
    ``char_count`` are printed as a preview; nothing is returned.
    """
    tweet_lengths = df['tweet'].str.len()  # whitespace is included in the length
    df['char_count'] = tweet_lengths
    preview_columns = ['tweet', 'char_count']
    print(df[preview_columns].head())

In [None]:
num_of_chars(train)

In [None]:
num_of_chars(test)

In [None]:
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to analyse.

    Returns
    -------
    float
        Total number of characters in all words divided by the number of
        words, or ``0.0`` when the sentence contains no words (empty or
        whitespace-only text), which would otherwise divide by zero.
    """
    words = sentence.split()
    if not words:
        # Nothing to average over; avoid ZeroDivisionError.
        return 0.0
    return sum(len(word) for word in words) / len(words)

In [None]:
def avg_word_length(df):
    """Add an ``avg_word`` column with the average word length of each tweet.

    Each tweet is passed to ``avg_word``. ``df`` is modified in place and the
    first rows of ``tweet`` and ``avg_word`` are printed; nothing is returned.
    """
    # avg_word already takes a single string, so no wrapper lambda is needed.
    df['avg_word'] = df['tweet'].apply(avg_word)
    preview_columns = ['tweet', 'avg_word']
    print(df[preview_columns].head())

In [None]:
avg_word_length(train)

In [None]:
avg_word_length(test)

In [None]:
import nltk
from nltk.corpus import stopwords
# Display NLTK's English stop-word list (as a set) in the cell output.
set(stopwords.words('english'))

- We can count the number of stopwords in each tweet as follows:

In [None]:
from nltk.corpus import stopwords
# English stop words as a list; stop_words() and stop_words_removal() read
# this module-level name.
stop = stopwords.words('english')

In [None]:
def stop_words(df):
    """Add a ``stopwords`` column counting the stop words in each tweet.

    Tokens are compared case-insensitively against the module-level ``stop``
    list. This function runs before the tweets are lower-cased, so a
    case-sensitive comparison would miss capitalised stop words such as
    "The" or "I".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``tweet`` column; modified in place.

    Returns
    -------
    None
        The first rows of ``tweet`` and ``stopwords`` are printed as a preview.
    """
    # Build the lookup once per call: set membership is O(1) per token,
    # whereas scanning the list is linear in its length.
    stop_set = {word.lower() for word in stop}
    df['stopwords'] = df['tweet'].apply(
        lambda text: sum(1 for token in text.split() if token.lower() in stop_set)
    )
    print(df[['tweet', 'stopwords']].head())

In [None]:
stop_words(train)

In [None]:
stop_words(test)

In [None]:
def hash_tags(df):
    """Add a ``hashtags`` column counting tokens that start with ``#``.

    ``df`` is modified in place and the first rows of ``tweet`` and
    ``hashtags`` are printed as a preview; nothing is returned.
    """
    def _count_hashtags(text):
        # A hashtag is any whitespace-separated token beginning with '#'.
        return sum(1 for token in text.split() if token.startswith('#'))

    df['hashtags'] = df['tweet'].apply(_count_hashtags)
    print(df[['tweet', 'hashtags']].head())

In [None]:
hash_tags(train)

In [None]:
hash_tags(test)

In [None]:
def num_numerics(df):
    """Add a ``numerics`` column counting the all-digit tokens in each tweet.

    A token counts only when ``str.isdigit()`` is true for the whole token.
    ``df`` is modified in place and the first rows of ``tweet`` and
    ``numerics`` are printed as a preview; nothing is returned.
    """
    def _count_numeric_tokens(text):
        digit_tokens = [token for token in text.split() if token.isdigit()]
        return len(digit_tokens)

    df['numerics'] = df['tweet'].apply(_count_numeric_tokens)
    print(df[['tweet', 'numerics']].head())

In [None]:
num_numerics(train)

In [None]:
num_numerics(test)

In [None]:
def num_uppercase(df):
    """Add an ``upper_case`` column counting the upper-case tokens per tweet.

    A token counts when ``str.isupper()`` is true for it. ``df`` is modified
    in place and the first rows of ``tweet`` and ``upper_case`` are printed
    as a preview; nothing is returned.
    """
    def _count_upper_tokens(text):
        return sum(1 for token in text.split() if token.isupper())

    df['upper_case'] = df['tweet'].apply(_count_upper_tokens)
    print(df[['tweet', 'upper_case']].head())

In [None]:
num_uppercase(train)

In [None]:
num_uppercase(test)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus used to illustrate the bag-of-words representation.
corpus = [
          'This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?',
         ]

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())

In [None]:
print(X.toarray())

In [None]:
# Same toy corpus, but the vocabulary is built from word bigrams only
# (ngram_range=(2, 2)).
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer2, 'get_feature_names_out'):
    print(vectorizer2.get_feature_names_out().tolist())
else:
    print(vectorizer2.get_feature_names())

In [None]:
print(X2.toarray())

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
# Same toy corpus again, this time vectorised with feature hashing.
# Note: this cell rebinds `corpus`, `vectorizer` and `X` from the
# CountVectorizer demo above.
corpus = [
          'This is the first document.',
          'This document is the second document.',
          'And this is the third one.',
          'Is this the first document?',
         ]
# 2**4 = 16 output columns, fixed up front by n_features.
vectorizer = HashingVectorizer(n_features=2**4)
X = vectorizer.fit_transform(corpus)
print(X.shape)

In [None]:
def lower_case(df):
    """Lower-case every tweet and collapse whitespace to single spaces.

    Each tweet is split on whitespace, every token is lower-cased, and the
    tokens are re-joined with one space. The ``tweet`` column of ``df`` is
    overwritten in place and its first rows are printed; nothing is returned.
    """
    def _normalise(text):
        return " ".join(map(str.lower, text.split()))

    df['tweet'] = df['tweet'].apply(_normalise)
    print(df['tweet'].head())

In [None]:
lower_case(train)

In [None]:
lower_case(test)

In [None]:
def punctuation_removal(df):
    """Strip every character that is neither a word character nor whitespace.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``tweet`` column; the column is overwritten in
        place with the cleaned text.

    Returns
    -------
    None
        The first rows of the cleaned ``tweet`` column are printed.
    """
    # regex=True is required: since pandas 2.0 Series.str.replace treats the
    # pattern as a literal string by default, which would remove nothing.
    # The raw string keeps \w and \s from being read as string escapes.
    df['tweet'] = df['tweet'].str.replace(r'[^\w\s]', '', regex=True)
    print(df['tweet'].head())

In [None]:
punctuation_removal(train)

In [None]:
punctuation_removal(test)

In [None]:
from nltk.corpus import stopwords
# Re-create the English stop-word list (same expression as the earlier `stop`
# cell) so stop_words_removal() below can read it.
stop = stopwords.words('english')

In [None]:
def stop_words_removal(df):
    """Remove stop words from every tweet.

    Tokens found in the module-level ``stop`` list are dropped and the rest
    are re-joined with single spaces. The ``tweet`` column of ``df`` is
    overwritten in place and its first rows are printed; nothing is returned.
    """
    def _drop_stop_words(text):
        kept_tokens = [token for token in text.split() if token not in stop]
        return " ".join(kept_tokens)

    df['tweet'] = df['tweet'].apply(_drop_stop_words)
    print(df['tweet'].head())

In [None]:
stop_words_removal(train)

In [None]:
stop_words_removal(test)

In [None]:
# Join all training tweets into one string, split it into tokens, and keep the
# first ten entries of value_counts() (the ten most frequent tokens).
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
freq

In [None]:
# Keep only the words (the Series index), dropping the counts;
# frequent_words_removal() reads this module-level list.
freq = list(freq.index)

In [None]:
def frequent_words_removal(df, words=None):
    """Remove a given set of (frequent) words from every tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``tweet`` column; the column is overwritten in
        place with the filtered text (tokens re-joined by single spaces).
    words : container of str, optional
        Words to remove. Defaults to the module-level ``freq``. Passing the
        list explicitly avoids depending on whatever ``freq`` currently holds:
        the notebook later rebinds ``freq`` to the rare-word list.

    Returns
    -------
    None
        The first rows of the filtered ``tweet`` column are printed.
    """
    if words is None:
        # Backward-compatible default: same global the original version read.
        words = freq
    df['tweet'] = df['tweet'].apply(
        lambda text: " ".join(token for token in text.split() if token not in words)
    )
    print(df['tweet'].head())

In [None]:
frequent_words_removal(train)

In [None]:
frequent_words_removal(test)

In [None]:
# Same token count as before, but keep the last ten entries of value_counts()
# (the ten least frequent tokens). This rebinds `freq`, replacing the
# frequent-word list computed earlier.
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq

In [None]:
# Keep only the words (the Series index), dropping the counts;
# rare_words_removal() reads this module-level list.
freq = list(freq.index)

In [None]:
def rare_words_removal(df, words=None):
    """Remove a given set of (rare) words from every tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``tweet`` column; the column is overwritten in
        place with the filtered text (tokens re-joined by single spaces).
    words : container of str, optional
        Words to remove. Defaults to the module-level ``freq``. Passing the
        list explicitly avoids depending on whatever ``freq`` currently holds,
        since that name is reused for both the frequent and the rare list.

    Returns
    -------
    None
        The first rows of the filtered ``tweet`` column are printed.
    """
    if words is None:
        # Backward-compatible default: same global the original version read.
        words = freq
    df['tweet'] = df['tweet'].apply(
        lambda text: " ".join(token for token in text.split() if token not in words)
    )
    print(df['tweet'].head())

In [None]:
rare_words_removal(train)

In [None]:
rare_words_removal(test)

In [None]:
from textblob import TextBlob

In [None]:
def spell_correction(df):
    """Return spell-corrected versions of the first five tweets.

    Each of the first five ``tweet`` values is run through TextBlob's
    ``correct()`` and converted back to ``str``. ``df`` is not modified.
    """
    def _correct(text):
        return str(TextBlob(text).correct())

    first_five = df['tweet'][:5]
    return first_five.apply(_correct)

In [None]:
spell_correction(train)

In [None]:
spell_correction(test)

In [None]:
def tokens(df):
    """Return the TextBlob word tokens of one sample tweet.

    The tweet is the one stored under key ``1`` of the ``tweet`` column
    (the second row when the frame has a default integer index).
    """
    sample_tweet = df['tweet'][1]
    blob = TextBlob(sample_tweet)
    return blob.words

In [None]:
tokens(train)

In [None]:
tokens(test)

In [None]:
from nltk.stem import PorterStemmer
# Shared Porter stemmer instance; stemming() reads this module-level name.
st = PorterStemmer()

In [None]:
def stemming(df):
    """Return stemmed versions of the first five tweets.

    Every whitespace-separated token is stemmed with the module-level
    stemmer ``st`` and the stems are re-joined with single spaces.
    ``df`` is not modified.
    """
    def _stem_text(text):
        stems = [st.stem(token) for token in text.split()]
        return " ".join(stems)

    first_five = df['tweet'][:5]
    return first_five.apply(_stem_text)

In [None]:
stemming(train)

In [None]:
stemming(test)

In [None]:
from textblob import Word

In [None]:
def lemmatization(df):
    """Replace every token of every tweet with its lemma.

    Each whitespace-separated token is wrapped in a TextBlob ``Word`` and
    lemmatized; the lemmas are re-joined with single spaces. The ``tweet``
    column of ``df`` is overwritten in place and its first rows are printed;
    nothing is returned.
    """
    def _lemmatize_text(text):
        lemmas = [Word(token).lemmatize() for token in text.split()]
        return " ".join(lemmas)

    df['tweet'] = df['tweet'].apply(_lemmatize_text)
    print(df['tweet'].head())

In [None]:
lemmatization(train)

In [None]:
lemmatization(test)

In [None]:
from textblob import TextBlob

In [None]:
def combination_of_words(df):
    """Return the word bigrams of one sample tweet.

    The tweet is the one stored under key ``0`` of the ``tweet`` column;
    the bigrams come from TextBlob's ``ngrams(2)``.
    """
    sample_tweet = df['tweet'][0]
    blob = TextBlob(sample_tweet)
    return blob.ngrams(2)

In [None]:
combination_of_words(train)

In [None]:
combination_of_words(test)

In [None]:
def term_frequency(df):
    """Return the term frequencies of the second tweet in ``df``.

    The slice ``[1:2]`` selects exactly one row (the second), whose text is
    split on single spaces and counted.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string ``tweet`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Up to five rows (``head()``) with columns ``words`` and ``tf``.
    """
    # Series.value_counts() replaces the deprecated top-level pd.value_counts().
    tf1 = (
        df['tweet'][1:2]
        .apply(lambda text: pd.Series(text.split(" ")).value_counts())
        .sum(axis=0)
        .reset_index()
    )
    tf1.columns = ['words', 'tf']
    return tf1.head()

In [None]:
term_frequency(train)

In [None]:
term_frequency(test)

In [None]:
# Term frequencies of the second training tweet (the slice 1:2 selects one row).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']
tf1.head()

In [None]:
# Term frequencies of the second test tweet (the slice 1:2 selects one row).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
tf2 = (test['tweet'][1:2]).apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis = 0).reset_index()
tf2.columns = ['words','tf']
tf2.head()

In [None]:
# Rebuild the term-frequency table of the second training tweet; the next
# cells add 'idf' and 'tfidf' columns to it.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.Series(x.split(" ")).value_counts()).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']

In [None]:
# Inverse document frequency per word: log(N / number of tweets containing it).
# regex=False matches the token literally; by default str.contains treats it as
# a regular expression, so a token with regex metacharacters (e.g. "(" or "*")
# would raise or match the wrong tweets.
# Note: this is a substring match, so "art" also matches tweets containing "party".
for i,word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word, regex=False)])))

In [None]:
# TF-IDF score of each word: its term frequency times its inverse document frequency.
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF features over the training tweets: word unigrams only, at most 1000
# features, lower-casing on, scikit-learn's built-in English stop words removed.
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word',
 stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['tweet'])
train_vect

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts over the training tweets: word unigrams only, at most
# 1000 features, lower-casing on.
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(train['tweet'])
train_bow

4.7 Model

In [None]:
def polarity_subjectivity(df):
    """Return TextBlob's ``sentiment`` value for each of the first five tweets.

    ``df`` is not modified.
    """
    first_five = df['tweet'][:5]
    return first_five.apply(lambda text: TextBlob(text).sentiment)

In [None]:
polarity_subjectivity(train)

In [None]:
def sentiment_analysis(df):
    """Add a ``sentiment`` column and return a preview of it.

    The column holds the first element of TextBlob's ``sentiment`` result for
    each tweet. ``df`` is modified in place; the returned frame is the
    ``head()`` of the ``tweet`` and ``sentiment`` columns.
    """
    def _first_sentiment_component(text):
        return TextBlob(text).sentiment[0]

    df['sentiment'] = df['tweet'].apply(_first_sentiment_component)
    preview_columns = ['tweet', 'sentiment']
    return df[preview_columns].head()

In [None]:
sentiment_analysis(train)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Vocabulary cap for the Keras tokenizer (identifier spelling kept: the model
# definition cell refers to `max_fatures`).
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# Read the text from train['tweet'] directly: `train_model` is only created in
# a later cell, so referencing it here raises NameError on a fresh
# Restart & Run All. train_model['tweet'] is the same column of `train`.
tokenizer.fit_on_texts(train['tweet'].values)
X = tokenizer.texts_to_sequences(train['tweet'].values)
# Pad every integer sequence to a common length so X is a 2-D array.
X = pad_sequences(X)

In [None]:
sentiment_analysis(test)

In [None]:
polarity_subjectivity(test)

In [None]:

# Two-column frame for modelling: the tweet text plus the 'sentiment' value
# that sentiment_analysis(train) wrote into `train`.
train_model = train[['tweet','sentiment']]
train_model.head()

In [None]:

# Imported here because the notebook's own import of train_test_split sits in a
# later cell; without it this cell raises NameError on a fresh Restart & Run All.
from sklearn.model_selection import train_test_split

# Hold out 33% of the padded sequences and their sentiment values; the fixed
# random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X,train[['sentiment']], test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Vocabulary cap passed to the tokenizer as num_words; the model cell reuses it
# as the embedding's input dimension.
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
# Fit the vocabulary on the training tweets, then encode each tweet as a list
# of integer word ids.
tokenizer.fit_on_texts(train_model['tweet'].values)
X = tokenizer.texts_to_sequences(train_model['tweet'].values)
# Pad the sequences to a common length so X is a 2-D array.
X = pad_sequences(X)

In [None]:
# Number of units in the LSTM layer.
lstm_out = 196

model = Sequential()
# The embedding's input_length must equal the padded sequence width. That width
# is decided by pad_sequences(X) from the data, so take it from X instead of
# hard-coding 17, which breaks as soon as preprocessing changes the longest tweet.
model.add(Embedding(max_fatures, 128,input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# Two softmax outputs, trained against one-hot labels with categorical cross-entropy.
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

In [None]:
# Hold out 33% of the padded sequences and their sentiment values; the fixed
# random_state keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X,train[['sentiment']], test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

In [None]:
from keras.utils import to_categorical
# One-hot encode the targets. num_classes is pinned to 2 so the label width
# always matches the model's Dense(2) output; left unset, to_categorical infers
# the width from the largest value present and can produce a different shape.
# NOTE(review): 'sentiment' holds the float values written by
# sentiment_analysis(), not integer class ids, while to_categorical is meant
# for integer labels. Confirm these are the intended training targets.
train_labels = to_categorical(Y_train, num_classes=2)
batch_size = 32
model.fit(X_train, train_labels, epochs = 7, batch_size=batch_size, verbose = 2)

In [None]:
# Carve the last `validation_size` rows off the held-out split and evaluate on
# the remainder.
# NOTE(review): X_test / Y_test are re-bound to their own slices, so re-running
# this cell shrinks them again; run it once per train/test split.
validation_size = 32
X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
# num_classes is pinned to 2 so the label width always matches the model's
# Dense(2) output, even if this subset happens to lack one of the classes.
test_labels = to_categorical( Y_test, num_classes=2)
score,acc = model.evaluate(X_test, test_labels, verbose = 2, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

In [None]:
# Score one hand-written sentence: wrap it in a one-row frame that has the
# 'tweet' column sentiment_analysis() reads.
data = {'No' : 1, 'tweet' : "this project is horrible"}
testline = pd.DataFrame(data, index=[0])
sentiment_analysis(testline)

In [None]:
# Load a separate review CSV and pull its 'Rating' and 'Review' columns out as
# plain lists for the scoring loop below.
x_test = pd.read_csv('/kaggle/input/xarvio/data_Field_Manager.csv')
rating = list(x_test["Rating"])
reviews =list(x_test["Review"]) 

In [None]:
# Score every review with sentiment_analysis() and rescale the result to a
# rating-like number.
predicted_review = []
for review in reviews:
    # sentiment_analysis() expects a frame with a 'tweet' column.
    testline = pd.DataFrame({'No': 1, 'tweet': review}, index=[0])
    score = sentiment_analysis(testline)
    # Take the single value with .iloc[0]; calling float() on a one-element
    # Series is deprecated in pandas.
    # (s + 1) * 2.5 maps a polarity s in [-1, 1] linearly onto [0, 5].
    predicted_review.append(float(score["sentiment"].iloc[0] + 1) * 2.5)


In [None]:
# Put each review, its original rating and the predicted score side by side
# for comparison.
data = {'Review' : reviews, 'Rating' : rating , 'Prediction' : predicted_review}

Result = pd.DataFrame(data)
Result

[Go to Top](#0)	