In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of each file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score, make_scorer
from collections import Counter
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
# Locations of the competition CSV files inside the Kaggle input directory.
train_csv_path = '/kaggle/input/twitter-sentiment-analysis-analytics-vidya/train_E6oV3lV.csv'
test_csv_path = '/kaggle/input/twitter-sentiment-analysis-analytics-vidya/test_tweets_anuFYb8.csv'

# Read both splits into DataFrames and display the training frame as the cell output.
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)
train_data

In [None]:
# Training targets: the 'label' column of the training frame.
y_train = train_data['label']
# NLTK's English stop-word list, held as a set so the membership test in preprocess() is cheap.
StopWords = set(stopwords.words('english'))

def preprocess(text, stop_words=None):
    """Normalize a raw tweet into a lowercase, letters-and-spaces-only string.

    Steps, in order:
      1. Split on whitespace and drop tokens that are stop words, contain a
         URL scheme ('http', matched case-insensitively) or contain '@'.
      2. Lowercase the surviving tokens and re-join them with single spaces.
      3. Remove '#' characters and all ASCII punctuation.
      4. Remove every remaining character that is not an ASCII letter or a space.

    Args:
        text: The tweet text to clean.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the module-level ``StopWords`` set is used.

    Returns:
        The cleaned text. Tokens that consist solely of stripped characters
        (e.g. digits) leave their surrounding spaces behind, so consecutive
        spaces may appear in the result.
    """
    if stop_words is None:
        stop_words = StopWords

    kept = []
    for word in text.split():
        lowered = word.lower()
        # Test the lowercased token so that 'HTTP://...' is recognised as a URL too.
        if lowered in stop_words or 'http' in lowered or '@' in word:
            continue
        kept.append(lowered)

    text = ' '.join(kept)
    text = text.replace('#', '')
    # str.translate returns a new string; the result must be assigned back.
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub('[^A-Za-z ]+', '', text)
    return text

# Clean every tweet of both splits; each list keeps the row order of its source frame.
X_train = [preprocess(tweet) for tweet in train_data['tweet']]
X_test = [preprocess(tweet) for tweet in test_data['tweet']]

In [None]:
lemmatizer = WordNetLemmatizer()


def lemmatize(data):
    """Lemmatize every whitespace-separated token of every text in ``data``.

    Each token is lemmatized twice: first with the lemmatizer's default
    part of speech, then again with ``pos='v'``.

    Args:
        data: Iterable of strings.

    Returns:
        A ``(lem_data, max_len)`` tuple. ``lem_data`` is a list with one
        string per input text, in which every token is preceded by a single
        space (so non-empty results start with a space). ``max_len`` is the
        largest token count of any single text, or 0 when ``data`` is empty.
    """
    lem_data = []
    max_len = 0
    for text in data:
        tokens = [
            lemmatizer.lemmatize(lemmatizer.lemmatize(token), pos='v')
            for token in text.split()
        ]
        # Prefix each token with a space; an empty token list yields ''.
        lem_data.append(''.join(' ' + token for token in tokens))
        max_len = max(max_len, len(tokens))

    return lem_data, max_len

# Lemmatize both splits. Only the training split's longest token count is kept
# (it is reused later as the padded sequence length); the test split's is discarded.
X_train_lem, max_len = lemmatize(X_train)
X_test_lem, _ = lemmatize(X_test)
print(max_len)

In [None]:
# Count how often each label value occurs in the training targets.
labels = Counter(y_train)
print(labels)

# Tick positions 0 and 1 are displayed with these human-readable class names.
ticks = [0,1]
classes = ['Not hate speech', 'Hate speech']

# Bar chart of the label frequencies, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(labels.keys(), labels.values())
ax.set_xticks(ticks)
ax.set_xticklabels(classes)
ax.set_xlabel('Sentiment')
ax.set_ylabel('Frequency')

In [None]:
# TF-IDF features over uni-, bi- and tri-grams. Terms occurring in fewer than
# 5 documents or in more than 80% of documents are ignored. The vocabulary is
# learned from the training text only and then applied to the test text.
tfidf = TfidfVectorizer(ngram_range=(1,3), min_df=5, max_df=0.8)
X_train_tfidf = tfidf.fit_transform(X_train_lem)
X_test_tfidf = tfidf.transform(X_test_lem)

# Fixed seed so that the search (and the model it selects) is reproducible across runs.
clf = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators':[200,300,400], 'max_depth':[20,30,None], 'criterion':['gini','entropy']}

# Exhaustive 5-fold search over the grid, scored by F1 and using all CPU cores.
grid = GridSearchCV(estimator=clf, param_grid=param_grid, n_jobs=-1, cv=5, scoring=make_scorer(f1_score))
cv_res = grid.fit(X_train_tfidf, y_train)

# Show a compact, ranked table instead of dumping the raw cv_results_ dict.
cv_summary = pd.DataFrame(cv_res.cv_results_)[
    ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
].sort_values('rank_test_score')
print(cv_summary.to_string(index=False))
print(cv_res.best_params_)

In [None]:
# The grid search was created without overriding `refit`, so GridSearchCV has
# already retrained the best configuration on the full training set; fitting
# it again would only repeat that (expensive) work.
best_clf = cv_res.best_estimator_
y_pred = best_clf.predict(X_test_tfidf)
y_pred

In [None]:
# Width of each word vector used by the embedding layer.
embedding_dim = 300

# Build the word index from the training text only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train_lem)
# One more than the number of distinct words in the fitted word index.
vocab_size = len(tokenizer.word_index) + 1

# Convert both splits to integer sequences with the same fitted tokenizer.
train_seq = tokenizer.texts_to_sequences(X_train_lem)
test_seq = tokenizer.texts_to_sequences(X_test_lem)

# Pad/truncate every sequence to the longest training text's token count.
train_pad = pad_sequences(train_seq, maxlen=max_len)
test_pad = pad_sequences(test_seq, maxlen=max_len)

In [None]:
# Stacked bidirectional-LSTM binary classifier, assembled layer by layer.
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_len))
# The first recurrent layer returns the full sequence so a second one can be stacked on it.
model.add(keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Bidirectional(keras.layers.LSTM(100)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(embedding_dim, activation='relu'))
# Single sigmoid unit: one output value per input sequence.
model.add(keras.layers.Dense(1, activation='sigmoid'))

adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])

# Train on the entire padded training set (no validation data is passed).
history = model.fit(train_pad, y_train, epochs=30, batch_size=512)

In [None]:
# Print the layer-by-layer summary of the network defined above.
model.summary()


In [None]:
# `Sequential.predict_classes` no longer exists in recent TensorFlow/Keras
# releases. For a single sigmoid output the equivalent is to threshold the
# predicted value at 0.5. `ravel()` flattens the (n_samples, 1) output to 1-D
# so it can be assigned directly as a DataFrame column.
# Note: this replaces the `y_pred` produced earlier by the random forest.
y_pred = (model.predict(test_pad) > 0.5).astype('int32').ravel()
y_pred

In [None]:
# Assemble the submission: the test ids alongside the predicted labels,
# written to 'Predictions.csv' in the working directory without the index column.
targets = test_data[['id']].copy()
targets['label'] = y_pred
targets.to_csv('Predictions.csv', index=False)