In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the train and test CSV files of the dataset into DataFrames.
train_csv = '../input/twitter-sentiment-analysis-hatred-speech/train.csv'
test_csv = '../input/twitter-sentiment-analysis-hatred-speech/test.csv'
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

In [None]:
# Summary statistics of the training frame (pandas `describe()` defaults).
train.describe()

In [None]:
# Preview the training data: take every 10th row and show the first five of those.
train.iloc[::10].head()

In [None]:
# Preview the test data: take every 10th row and show the first five of those.
test.iloc[::10].head()

In [None]:
# Show the first training rows whose tweet text contains a ">" character.
contains_gt = train['tweet'].str.contains(">")
train[contains_gt].head()

In [None]:
# Count how many training rows carry each label value (1 and 0).
labels = train['label']
print('Negative class count in train dataset: ', int((labels == 1).sum()))
print('Neutral class count in train dataset: ', int((labels == 0).sum()))

In [None]:
# Split out the tweet text (model input) for both sets and the label column (target).
text_train = train['tweet']
text_test = test['tweet']
y_train = train['label']
print(text_train)


# Applying the bag-of-words model to the list of tweets

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training tweets, then encode those tweets
# as a token-count matrix.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{!r}".format(X_train))


In [None]:
# Let's look at the vocabulary learned by the vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
_get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
# Convert to a plain list so the printed output keeps the original list format.
feature_names = list(_get_names())
print("Features quantity: {}".format(len(feature_names)))
print("First 20 features:\n {}".format(feature_names[:20]))
print("Each 100th feature:\n {}".format(feature_names[::100]))

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
# Baseline: 5-fold cross-validated score of a default logistic regression
# on the bag-of-words features.
baseline_model = LogisticRegression()
scores = cross_val_score(baseline_model, X_train, y_train, cv=5)
print("Average accuracy in cross val: {:.2f}".format(scores.mean()))

Let's try to improve it using grid search (`GridSearchCV`):

In [None]:
from sklearn.model_selection import GridSearchCV
# Search over the regularisation strength C of logistic regression with 5-fold CV.
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print("The best value for cross val: {:.2f}".format(grid.best_score_))
print("The best parameters: ", grid.best_params_)

The result, 0.96, is the same as in the previous step.

Let's set the minimum number of documents in which a token must appear (`min_df`):

In [None]:
# Rebuild the bag-of-words features, keeping only tokens that occur in at least 5 tweets.
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df:\n{!r}".format(X_train))

In [None]:
# Inspect the vocabulary learned by the current vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
_get_names = getattr(vect, "get_feature_names_out", None) or vect.get_feature_names
# Convert to a plain list so the printed output keeps the original list format.
feature_names = list(_get_names())
print("Features quantity: {}".format(len(feature_names)))
print("First 20 features:\n {}".format(feature_names[:20]))
print("Each 100th feature:\n {}".format(feature_names[::100]))

In [None]:
# Repeat the grid search over C on the current X_train features.
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print("The best value for cross val: {:.2f}".format(grid.best_score_))
print("The best parameters: ", grid.best_params_)

The result, 0.96, is the same as in the previous step. The `min_df` parameter probably doesn't matter for this dataset.  

# Stop words
Let's try to improve the results by removing stop words.

In [None]:
# Rebuild the bag-of-words features with min_df=5 and the built-in English stop-word list.
vect = CountVectorizer(min_df=5, stop_words="english")
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with stop words removing :\n{!r}".format(X_train))

In [None]:
# Repeat the grid search over C on the current X_train features.
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5).fit(X_train, y_train)
print("The best value for cross val: {:.2f}".format(grid.best_score_))
print("The best parameters: ", grid.best_params_)

The best value is the same: 0.96. Removing stop words doesn't improve the result either.

# TF-IDF 
Now let's try TF-IDF scaling and see whether it improves the results.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
# Chain TF-IDF vectorisation with logistic regression, so the vectorizer is
# fitted as part of the pipeline on the raw tweets, and grid-search over C.
tfidf_step = TfidfVectorizer(min_df=5, norm=None)
pipe = make_pipeline(tfidf_step, LogisticRegression())
param_grid = {'logisticregression__C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(pipe, param_grid, cv=5).fit(text_train, y_train)
print("The best result: {:.2f}".format(grid.best_score_))

The result is the same again.
But let's look at the features with the lowest and the highest TF-IDF values:

In [None]:
# Take the fitted TF-IDF step out of the best pipeline found by the grid search
# and re-encode the training tweets with it.
vectorizer = grid.best_estimator_.named_steps["tfidfvectorizer"]
X_train = vectorizer.transform(text_train)
# Largest TF-IDF value each feature reaches across all training tweets.
max_value = X_train.max(axis=0).toarray().ravel()
# Feature indices ordered from lowest to highest maximum TF-IDF value.
sorted_by_tfidf = max_value.argsort()
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
_get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
# Keep an ndarray so the names can be indexed with the sorted index arrays.
feature_names = np.array(_get_names())
print("Features with minimum value of tfidf: \n{}".format(feature_names[sorted_by_tfidf[:100]]))
print("Features with maximum value of tfidf: \n{}".format(feature_names[sorted_by_tfidf[-100:]]))

In [None]:
# Order features by their inverse document frequency, lowest first, and list
# the 100 with the smallest IDF.
sorted_by_idf = vectorizer.idf_.argsort()
lowest_idf_features = feature_names[sorted_by_idf[:100]]
print("Features with minimum value idf:\n{}".format(lowest_idf_features))