Import libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


Loading data

In [2]:
df_train = pd.read_csv('datasets_897_1638_Sheet_1.csv')
df_train.head()

Unnamed: 0,response_id,class,response_text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,response_1,not_flagged,I try and avoid this sort of conflict,,,,,
1,response_2,flagged,Had a friend open up to me about his mental ad...,,,,,
2,response_3,flagged,I saved a girl from suicide once. She was goin...,,,,,
3,response_4,not_flagged,i cant think of one really...i think i may hav...,,,,,
4,response_5,not_flagged,Only really one friend who doesn't fit into th...,,,,,


In [3]:
from io import StringIO
col = ['class', 'response_text']

df_train = df_train[col]
df_train = df_train[pd.notnull(df_train['response_text'])]

df_train.columns = ['class', 'response_text']

df_train['category_id'] = df_train['class'].factorize()[0]

category_id_df_train = df_train[['class', 'category_id']].drop_duplicates().sort_values('category_id')

category_to_id = dict(category_id_df_train.values)

id_to_category = dict(category_id_df_train[['category_id', 'class']].values)

df_train.head()

Unnamed: 0,class,response_text,category_id
0,not_flagged,I try and avoid this sort of conflict,0
1,flagged,Had a friend open up to me about his mental ad...,1
2,flagged,I saved a girl from suicide once. She was goin...,1
3,not_flagged,i cant think of one really...i think i may hav...,0
4,not_flagged,Only really one friend who doesn't fit into th...,0


In [4]:
df_train.shape

(80, 3)

In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 79
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   class          80 non-null     object
 1   response_text  80 non-null     object
 2   category_id    80 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


Tokening the data with spacy

In [17]:
import string
import spacy
spacy.load("en_core_web_sm")
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

Create list of punctuation marks

In [27]:

punctuations = string.punctuation
#create list of stopwords
nlp = spacy.load('en_core_web_sm')
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#load English tokenizer, tagger, parser, NER and word vectors

parser = English()

#creating tokenizer function
def spacy_tokenizer(sentence):
    #creating token object, which is used to create documents with linguistic anntations
    mytokens = parser(sentence)
    
    #Lemmatizing each token
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    
    
    #Removing stopwords
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
    
    
    
    return mytokens



Defining a custom transformer

In [28]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

In [29]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    
    def fit(self, X, y = None, **fit_params):
        return self
    
    def get_params(self, deep = True):
        return {}
    
    
    
#Basic function to clean the text
def clean_text(text):
    
    return text.strip().lower()

Vectorization Feature Engeneering

In [30]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range = (1,1))

Splitting the data into training and test sets

In [31]:
from sklearn.model_selection import train_test_split
X = df_train['response_text']
ylabels = df_train['category_id']

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size = 0.3)

Creating a pipeline and generating the model

the pipeline components using fit()

Logistic Regression Classifier

In [32]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

#Create pipeline using Bag of words

pipe = Pipeline([("cleaner", predictors()),
                ('vectorizer', bow_vector),
                ('classifier', classifier)])

#model generation 

pipe.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f83b3bc5750>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7f83b338b710>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
            

Evaluating the module

Let's take a look at how our model actually performs, using the metrics modele

In [35]:
from sklearn import metrics
#Predicting with a test dataset
predicted = pipe.predict(X_test)

#Model accuracy

print("Logistic Regression Accuracy:", metrics.accuracy_score(y_test, predicted))

print("Logistic Regression Precision:", metrics.precision_score(y_test, predicted))

print("Logistic Regression Recall:", metrics.recall_score(y_test, predicted))




Logistic Regression Accuracy: 0.75
Logistic Regression Precision: 1.0
Logistic Regression Recall: 0.25
