# For Train Data

## Load Data

In [1]:
import pandas as pd

In [2]:
train = pd.read_csv("train.csv", index_col="id")
train.head()

Unnamed: 0_level_0,label,tweet
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,@user when a father is dysfunctional and is s...
2,0,@user @user thanks for #lyft credit i can't us...
3,0,bihday your majesty
4,0,#model i love u take with u all the time in ...
5,0,factsguide: society now #motivation


In [3]:
X = train.tweet
X.head()

id
1     @user when a father is dysfunctional and is s...
2    @user @user thanks for #lyft credit i can't us...
3                                  bihday your majesty
4    #model   i love u take with u all the time in ...
5               factsguide: society now    #motivation
Name: tweet, dtype: object

In [4]:
y = train.label
y.head()

id
1    0
2    0
3    0
4    0
5    0
Name: label, dtype: int64

## Split data into train and test

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## Perform filtering word

In [6]:
from nltk.corpus import stopwords
useful_words = [word for word in X_train if word not in stopwords.words("english")]
useful_words[:5]

["@user @user sorry guys, i've just taken a therapy room in scunthorpe so i have lots to do #decorating   #newdirection",
 "@user loved tonight's interview - two great guys having easy fun - just what we all need after all the bad news at the weekend  ",
 'my album "a challager approaches" drops in 2 days! # getready   #thatreal #dope',
 'ð\x9f\x98\x84 #smile #smiling top.tags #toptags #smiles #beautifulsmile #smiley #smilee #pretty  â\x80¦ ',
 'true.    teens poems ness quotes #hu #alone #lost #brokenquotes #brokenâ\x80¦ ']

## Create the prerequisites for model

In [7]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stopwords 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from nltk.tokenize import RegexpTokenizer
import re


import string
punctuations = string.punctuation
digits = string.hexdigits


from spacy.lang.en import English
parser = English()

#Custom transformer using spaCy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

# Basic utility function to clean the text 
def clean_text(text):     
    return text.strip().lower()

#Create spacy tokenizer that parses a sentence and generates tokens
#these can also be replaced by word vectors 
def spacy_tokenizer(sentence):
    sentence = re.sub('[^A-Za-z0-9]+', ' ', sentence)
    tokens = parser(sentence)
    tokens = [tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_ for tok in tokens]
    tokens = [tok for tok in tokens if (tok not in stopwords and tok not in punctuations 
                                        and tok not in digits)]     
    return tokens

#create vectorizer object to generate feature vectors, we will use custom spacy’s tokenizer
vectorizer = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(-1,1)) 

classifier = LinearSVC()

## Intialize model

In [8]:
# Create the  pipeline to clean, tokenize, vectorize, and classify 
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vectorizer),
                 ('classifier', classifier)])

## Model fit

In [9]:
# Train Model
pipe.fit(X_train,y_train) 

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x7f9e19c52908>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ng...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

## Predict and calculate accuracy

In [10]:
y_pred = pipe.predict(X_test)
from sklearn.metrics import accuracy_score
res = accuracy_score(y_test,y_pred)
res #0.9595183921122488

0.9575274933636708

# For Test data

In [11]:
test = pd.read_csv("test.csv")
test.head()
y_test_res = test.tweet.tolist()

In [12]:
y_test_res[:5]

['#studiolife #aislife #requires #passion #dedication #willpower   to find #newmaterialsâ\x80¦ ',
 ' @user #white #supremacists want everyone to see the new â\x80\x98  #birdsâ\x80\x99 #movie â\x80\x94 and hereâ\x80\x99s why  ',
 'safe ways to heal your #acne!!    #altwaystoheal #healthy   #healing!! ',
 'is the hp and the cursed child book up for reservations already? if yes, where? if no, when? ð\x9f\x98\x8dð\x9f\x98\x8dð\x9f\x98\x8d   #harrypotter #pottermore #favorite',
 '  3rd #bihday to my amazing, hilarious #nephew eli ahmir! uncle dave loves you and missesâ\x80¦ ']

In [13]:
result =  pipe.predict(y_test_res)
result[:100]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
output = pd.DataFrame({'label':result},index =test.id)
output.to_csv("res.csv")