<div class="alert alert-block alert-info">
This is <b>Rank 29</b> solution using SGD Classifier and One Vs Rest Classifier. 
  
I have also tried <b>fastai</b> but it didn't give me good results even after fine-tuning it. I have also uploaded fastai solution in this folder which gave me F1-Score of 0.4676695183 on Public leaderboard which will be good reference point for using fastai for NLP problems.
</div>

# Import Libraries

In [1]:
import re
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings("ignore")

In [2]:
from nltk.corpus import stopwords
nltk_stopwords = stopwords.words('english')

# Import Datasets

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
print('No. of samples in training set:', len(train))
print('No. of samples in test set    :', len(test))

No. of samples in training set: 5279
No. of samples in test set    : 2924


# Text pre-processing

In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'@[a-zA-Z0-9_]+', '', text)   
    text = re.sub(r'https?://[A-Za-z0-9./]+', '', text)   
    text = re.sub(r'www.[^ ]+', '', text)  
    text = re.sub(r'[a-zA-Z0-9]*www[a-zA-Z0-9]*com[a-zA-Z0-9]*', '', text)  
    text = re.sub(r'[^a-zA-Z]', ' ', text)   
    text = [token for token in text.split() if len(token) > 2]
    text = ' '.join(text)
    return text

train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

# Train test split

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_cv, y_train, y_cv = train_test_split(train['text'], train['sentiment'], test_size=0.25, stratify=train['sentiment'], 
                                                random_state=1)

## Feature extraction & Model building

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, 
                             min_df=3, max_features=None, binary=False, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [8]:
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_cv_tfidf = tfidf_vect.transform(X_cv)

In [9]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='log', max_iter=200, random_state=0, class_weight='balanced')
ovr = OneVsRestClassifier(sgd)
ovr.fit(X_train_tfidf, y_train)
y_pred_class = ovr.predict(X_cv_tfidf)
print('f1_score       :', f1_score(y_cv, y_pred_class, average='macro'))
print('accuracy score :', accuracy_score(y_cv, y_pred_class))

f1_score       : 0.5145775153047341
accuracy score : 0.6613636363636364


# Predict on test set

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer='word', stop_words=None, token_pattern='(?u)\\b\\w\\w+\\b', ngram_range=(1, 1), max_df=1.0, 
                             min_df=3, max_features=None, binary=False, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

In [11]:
full_text = list(train['text'].values) + list(test['text'].values)
tfidf_vect.fit(full_text)

X_train_tfidf = tfidf_vect.transform(train['text'])
X_test_tfidf = tfidf_vect.transform(test['text'])

y_train = train['sentiment']

In [12]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='log', max_iter=200, random_state=0, class_weight='balanced')
ovr = OneVsRestClassifier(sgd)
ovr.fit(X_train_tfidf, y_train)
y_pred_class = ovr.predict(X_test_tfidf)
y_pred_class

array([2, 1, 2, ..., 2, 2, 2])

# Submission

In [13]:
test['sentiment'] = y_pred_class
test.drop(['text','drug'], axis=1,inplace=True)
test.head()

Unnamed: 0,unique_hash,sentiment
0,9e9a8166b84114aca147bf409f6f956635034c08,2
1,e747e6822c867571afe7b907b51f0f2ca67b0e1a,1
2,50b6d851bcff4f35afe354937949e9948975adf7,2
3,7f82ec2176ae6ab0b5d20b5ffc767ac829f384ae,2
4,8b37d169dee5bdae27060949242fb54feb6a7f7f,2


In [14]:
test['sentiment'].value_counts()

2    1958
0     487
1     479
Name: sentiment, dtype: int64

In [15]:
test.to_csv('submission.csv', index=False)