<a href="https://colab.research.google.com/github/raiadi96/Pytorch/blob/master/SentenceClassifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd


In [5]:
data_yelp = pd.read_table('/content/yelp_labelled.txt')
data_imdb = pd.read_table('/content/imdb_labelled.txt')
data_amazon = pd.read_table('/content/amazon_cells_labelled.txt')

#concatenate the tables in 1 list 

frames = [data_yelp, data_imdb, data_amazon]

for colname in frames:
  print(colname.columns)

# as we can see there is no colum header so first we will a column header for all the data frames

for colname in frames:
  colname.columns = ['Message',  'Target']
  print(colname.columns)

Index(['Wow... Loved this place.', '1'], dtype='object')
Index(['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ', '0'], dtype='object')
Index(['So there is no way for me to plug it in here in the US unless I go by a converter.', '0'], dtype='object')
Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')
Index(['Message', 'Target'], dtype='object')


In [6]:
keys = ['yelp', 'imdb', 'amazon']

dataset = pd.concat(frames, keys= keys)

In [7]:
dataset.head()

Unnamed: 0,Unnamed: 1,Message,Target
yelp,0,Crust is not good.,0
yelp,1,Not tasty and the texture was just nasty.,0
yelp,2,Stopped by during the late May bank holiday of...,1
yelp,3,The selection on the menu was great and so wer...,1
yelp,4,Now I am getting angry and I want my damn pho.,0


In [8]:
#check if any of the columns has null values

dataset.isnull().sum()

Message    0
Target     0
dtype: int64

In [9]:
# we will now remove the stopwords and lemmatize the text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en')


In [12]:
stop_words = list(STOP_WORDS)
stop_words[:10]

['since',
 'whose',
 'you',
 "'d",
 'n‘t',
 'amount',
 'thereupon',
 'this',
 'almost',
 'once']

In [15]:
#create a spacy parser
from spacy.lang.en import English
parser = English()
import string

#create a custom tokenizer and stop word removal method

def spacy_tokenizer(sentence):
  myToken = parser(sentence)
  myToken = [word.lemma_.lower().strip() if word.lemma_ != '--PRON--' else wprd.lower_ for word in myToken]
  myToken =  [word  for word in myToken if word not in stop_words and word not in string.punctuation]
  return myToken

In [23]:
#import sklearn packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [24]:
class CustomTransformer(TransformerMixin):
  def transform(self, X, **transform_params):
    return [clean_text(text) for text in X]
  
  def fit(self, X, y = None, **fit_params):
    return self
  
  def get_params(self, deep = True):
    return {}

def clean_text(sent):
  return sent.strip().lower()

In [27]:
vectorizer = CountVectorizer(tokenizer= spacy_tokenizer, ngram_range= (1,1))
classifier = LinearSVC()

#tfidf vectorizer
tfidf_vectorizer = TfidfVectorizer(tokenizer= spacy_tokenizer)

In [29]:
X, y  = dataset['Message'],dataset['Target']

In [30]:
from sklearn.model_selection import train_test_split
X_train, X_text, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 123)


In [31]:
pipe = Pipeline([
                 ("cleaner", CustomTransformer()),
                 ("vectorizer", vectorizer),
                 ("classifier", classifier)
])

In [32]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('cleaner',
                 <__main__.CustomTransformer object at 0x7f9e9026ba58>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7f9e9d3dfe18>,
                                 vocabulary=None)),
                ('classifier',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_sca

In [33]:
count_vectorizer_predictions = pipe.predict(X_text)

In [34]:
count_vectorizer_predictions

array([0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,