In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import os
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# 1. Load the Dataset

In [None]:
# For this project, I am using a different dataset from the one provided,
# as it is more relevant in a portfolio than a zodiac-predicting classifier.

# link to dataset - https://www.kaggle.com/competitions/feedback-prize-2021/data

In [4]:
# Load train.csv from the mounted Google Drive folder
# (presumably the training file of the Kaggle competition linked above — confirm).
raw_data = pd.read_csv('/content/drive/MyDrive/Argument_Analysis/train.csv')

In [5]:
# Keep only the first 10,000 rows (presumably to keep the later steps fast and within memory).
# NOTE: this rebinds raw_data, so the full dataset is only available again by re-running the load cell.
raw_data = raw_data[:10000]

In [81]:
raw_data.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [6]:
# id - not necessary
# discourse id - not necessary
# discourse_start - will drop for this exercise
# discourse_end - will drop for this exercise
# discourse_text - main text feature
# discourse_type - target label
# discourse_type_num - will drop for this exercise
# predictionstring - will drop for this exercise

# 2. Preprocess the rows

### a. Remove unwanted characters

In [7]:
# using a regex inside a custom helper function so that it can be applied to the text column

import re, string

def clean_string(str_data):
  """Strip every character that is not an ASCII letter, a digit or a space.

  Whitespace other than a plain space (newlines, tabs, ...) is converted to a
  space before filtering instead of being deleted, so the words on either side
  of a line break are not fused into one token. Runs of spaces are left as
  they are.

  Args:
    str_data: text to clean.

  Returns:
    The text containing only the characters a-z, A-Z, 0-9 and spaces.
  """
  # Map every whitespace character to a plain space so it survives the filter below.
  str_data = re.sub(r"\s", ' ', str_data)
  # Drop everything that is neither alphanumeric nor a space.
  str_data = re.sub(r"[^a-zA-Z0-9 ]", '', str_data)
  return str_data

clean_string('Simaran is.     noT ABLE to d2298##$@#@0')

'Simaran is     noT ABLE to d22980'

### b. Convert text to lowercase

In [8]:
def to_lower(str_data):
  """Return a lowercase copy of the given text."""
  return str_data.lower()

to_lower('SIMARAN is MY naME')

'simaran is my name'

### c. Remove unwanted spaces

In [9]:
def remove_unwanted_space(str_data):
  """Collapse every run of whitespace into one space and trim both ends."""
  return ' '.join(str_data.split())

remove_unwanted_space('HOw    much    ')

'HOw much'

### d. Remove stopwords

In [10]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [12]:
# Stopword sets are cached here so that they are built once instead of once
# per call (the function is applied to every row of the dataset).
_STOP_WORDS_CACHE = {}

def remove_stopwords(str_data):
  """Remove English stopwords from a piece of text.

  The text is tokenized with NLTK's ``word_tokenize`` and every token found in
  NLTK's English stopword list is dropped. The comparison is case-sensitive,
  so capitalised stopwords are only removed if the text was lowercased
  beforehand.

  Args:
    str_data: text to filter.

  Returns:
    A single string made of the remaining tokens joined by one space.
  """
  if 'english' not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
  stop_words = _STOP_WORDS_CACHE['english']

  # tokenizing the string data
  word_tokens = word_tokenize(str_data)

  # filter the data by removing stopwords
  filtered_text = [word for word in word_tokens if word not in stop_words]

  # returning the filtered words joined back into one string
  return ' '.join(filtered_text)

remove_stopwords('This is Simaran...')

'This Simaran ...'

In [13]:
# creating a pre-processing pipeline function

def preprocess_text(str_data):
  """Run the whole text-cleaning pipeline on one string.

  The steps are applied in this order: strip unwanted characters, lowercase,
  normalise whitespace, remove English stopwords.
  """
  pipeline = (clean_string, to_lower, remove_unwanted_space, remove_stopwords)
  for step in pipeline:
    str_data = step(str_data)
  return str_data

preprocess_text('What is THE greatest   \n   computer THAT has 3ver existed###??? 42##42')

'greatest computer 3ver existed 4242'

In [14]:
# Build the cleaned frame as a new object so that raw_data itself stays untouched.
processed_df = raw_data.assign(
    discourse_text=raw_data['discourse_text'].apply(preprocess_text)
)

# 3. Bring dataset down to 2 columns - text and labels

In [15]:
# keeping only the text feature and the target label for this exercise

subset_df = processed_df[['discourse_text', 'discourse_type']]
subset_df.head()

Unnamed: 0,discourse_text,discourse_type
0,modern humans today always phone always phone ...,Lead
1,really bad consequences stuff happens comes phone,Position
2,certain areas united states ban phones class r...,Evidence
3,people phones know certain apps apps like face...,Evidence
4,driving one way get around people always phone...,Claim


In [16]:
subset_df.shape

(10000, 2)

In [17]:
subset_df['discourse_type'].value_counts()

# classes present in the target variable

Evidence                3317
Claim                   3268
Position                1076
Concluding Statement     913
Lead                     817
Counterclaim             340
Rebuttal                 269
Name: discourse_type, dtype: int64

# 4. Separate features and labels, and split the data into training and testing 

In [18]:
from sklearn.model_selection import train_test_split

# features: the cleaned text; labels: the discourse type
X, y = subset_df['discourse_text'], subset_df['discourse_type']

# hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# 5. Vectorize the features

### a. Create a Bag of Words using count vectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# creating the bag-of-words vectorizer; ngram_range=(1, 2) counts both unigrams and bigrams
count_vec = CountVectorizer(ngram_range=(1, 2))

# learning the vocabulary from the training data only, so no test-set terms leak into the features
count_vec.fit(X_train)

CountVectorizer(ngram_range=(1, 2))

In [20]:
# transforming the train and test datasets

# CountVectorizer.transform returns SciPy sparse matrices. They are kept sparse
# on purpose: with unigrams + bigrams the training matrix is 7000 x 96054, and
# converting it to a dense array would allocate roughly 5 GB for a matrix that
# is almost entirely zeros. The sparse matrices expose the same .shape.
X_train_count_vec = count_vec.transform(X_train)
X_test_count_vec = count_vec.transform(X_test)

print(X_train_count_vec.shape)
print(X_test_count_vec.shape)

(7000, 96054)
(3000, 96054)


### b. Print the term-document matrix

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# learn the TF-IDF vocabulary and idf weights from the training text only
tfidf_vec = TfidfVectorizer().fit(X_train)

# transform each split and convert it to a dense array in one step;
# the dense form is what the later pd.DataFrame(...) and classifier cells receive
X_train_tfidf_vec = tfidf_vec.transform(X_train).toarray()
X_test_tfidf_vec = tfidf_vec.transform(X_test).toarray()

print(X_train_tfidf_vec.shape)
print(X_test_tfidf_vec.shape)

(7000, 10337)
(3000, 10337)


# 6. Create a dictionary to get the count of every label 

In [24]:
dict(y_train.value_counts())

{'Claim': 2285,
 'Concluding Statement': 648,
 'Counterclaim': 240,
 'Evidence': 2284,
 'Lead': 572,
 'Position': 783,
 'Rebuttal': 188}

In [25]:
dict(y_test.value_counts())

{'Claim': 983,
 'Concluding Statement': 265,
 'Counterclaim': 100,
 'Evidence': 1033,
 'Lead': 245,
 'Position': 293,
 'Rebuttal': 81}

# 7. Transform the labels

In [26]:
# for this project a LabelEncoder is used instead of a MultiLabelBinarizer, since each text has exactly one discourse_type label

from sklearn.preprocessing import LabelEncoder

# create the encoder and fit it on the training labels in one step
label_enc = LabelEncoder().fit(y_train)

# display the classes the encoder has learned
label_enc.classes_

array(['Claim', 'Concluding Statement', 'Counterclaim', 'Evidence',
       'Lead', 'Position', 'Rebuttal'], dtype=object)

In [27]:
# encode both label splits with the encoder that was fitted on the training labels
y_train_enc, y_test_enc = (
    label_enc.transform(labels) for labels in (y_train, y_test)
)

# 8. Choose a classifier

In [34]:
pd.DataFrame(X_train_tfidf_vec).max()

0        0.406998
1        0.352070
2        0.180203
3        0.356580
4        0.475920
           ...   
10332    0.294067
10333    0.532181
10334    0.516665
10335    0.626397
10336    0.282313
Length: 10337, dtype: float64

In [42]:
# first trying a naive bayes classifier, wrapped in a OneVsRest classifier

from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier

# GaussianNB is the base estimator; OneVsRestClassifier fits one copy of it per class
gnb = GaussianNB()
ovr_clf_gnb = OneVsRestClassifier(estimator=gnb)

# fit on the dense TF-IDF features and the integer-encoded labels
# NOTE(review): GaussianNB models every feature as normally distributed, which is a
# questionable fit for TF-IDF weights — MultinomialNB / ComplementNB may be worth trying
ovr_clf_gnb.fit(X_train_tfidf_vec, y_train_enc)

OneVsRestClassifier(estimator=GaussianNB())

In [44]:
y_preds_enc = ovr_clf_gnb.predict(X_test_tfidf_vec)

In [46]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, cohen_kappa_score

In [48]:
print('Accuracy for GaussianNB classifier set is: ', accuracy_score(y_test_enc, y_preds_enc))

Accuracy for GaussianNB classifier set is:  0.276


In [49]:
print('Classification report \n', classification_report(y_test_enc, y_preds_enc))

Classification report 
               precision    recall  f1-score   support

           0       0.41      0.09      0.14       983
           1       0.19      0.22      0.21       265
           2       0.55      0.06      0.11       100
           3       0.65      0.48      0.55      1033
           4       0.12      0.29      0.17       245
           5       0.28      0.29      0.28       293
           6       0.04      0.38      0.07        81

    accuracy                           0.28      3000
   macro avg       0.32      0.26      0.22      3000
weighted avg       0.43      0.28      0.30      3000



In [51]:
# next trying a Logistic Regression 

from sklearn.linear_model import LogisticRegression

# LogisticRegression with default settings as the base estimator, again wrapped one-vs-rest
log_reg = LogisticRegression()
ovr_clf_lr = OneVsRestClassifier(estimator=log_reg)
# fit on the same TF-IDF features and integer-encoded labels as the naive Bayes model
ovr_clf_lr.fit(X_train_tfidf_vec, y_train_enc)

OneVsRestClassifier(estimator=LogisticRegression())

In [52]:
y_preds_enc = ovr_clf_lr.predict(X_test_tfidf_vec)

In [53]:
print('Accuracy for Logistic Regression Classifier set is: ', accuracy_score(y_test_enc, y_preds_enc))

Accuracy for Logistic Regression Classifier set is:  0.6523333333333333


In [54]:
print('Classification report \n', classification_report(y_test_enc, y_preds_enc))

Classification report 
               precision    recall  f1-score   support

           0       0.63      0.81      0.71       983
           1       0.75      0.30      0.43       265
           2       0.74      0.23      0.35       100
           3       0.67      0.79      0.73      1033
           4       0.65      0.31      0.42       245
           5       0.64      0.52      0.58       293
           6       0.89      0.10      0.18        81

    accuracy                           0.65      3000
   macro avg       0.71      0.44      0.48      3000
weighted avg       0.67      0.65      0.63      3000



In [55]:
# next trying using a Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's internal randomness (feature permutation and
# tie-breaking between equally good splits) so that re-running the cell gives
# the same model; 42 matches the seed used for the train/test split.
dt = DecisionTreeClassifier(max_depth=100, random_state=42)

ovr_clf_dt = OneVsRestClassifier(estimator=dt)
# NOTE: unlike the other two models, this one is fitted on the raw string labels
# (y_train), so its predictions are label names and must be scored against y_test.
ovr_clf_dt.fit(X_train_tfidf_vec, y_train)

OneVsRestClassifier(estimator=DecisionTreeClassifier(max_depth=100))

In [57]:
# NOTE: despite the `_enc` suffix these predictions are string labels, because
# ovr_clf_dt was fitted on y_train rather than y_train_enc
y_preds_enc = ovr_clf_dt.predict(X_test_tfidf_vec)

In [61]:
print('Accuracy for Decision Tree Classifier set is: ', accuracy_score(y_test, y_preds_enc))

Accuracy for Decision Tree Classifier set is:  0.4583333333333333


In [63]:
print('Classification report \n', classification_report(y_test, y_preds_enc))

Classification report 
                       precision    recall  f1-score   support

               Claim       0.69      0.51      0.59       983
Concluding Statement       0.24      0.23      0.23       265
        Counterclaim       0.43      0.26      0.33       100
            Evidence       0.56      0.56      0.56      1033
                Lead       0.31      0.25      0.28       245
            Position       0.21      0.46      0.29       293
            Rebuttal       0.19      0.19      0.19        81

            accuracy                           0.46      3000
           macro avg       0.38      0.35      0.35      3000
        weighted avg       0.50      0.46      0.47      3000



In [None]:
# picking the Logistic Regression classifier as it is the best-performing one (accuracy 0.65 vs. 0.46 for the Decision Tree and 0.28 for GaussianNB)

#### i. Accuracy score

In [64]:
# re-predict with the logistic regression model: y_preds_enc was last overwritten
# with the decision tree's predictions
y_preds_enc = ovr_clf_lr.predict(X_test_tfidf_vec)

In [65]:
print('The Accuracy achieved by a Logistic Classifier is: ', accuracy_score(y_test_enc, y_preds_enc))

The Accuracy achieved by a Logistic Classifier is:  0.6523333333333333


#### ii. F1 score

In [68]:
from sklearn.metrics import f1_score

print('The weighted f1 score of the Logistic Classifier is: ', f1_score(y_test_enc, y_preds_enc, average='weighted'))

The weighted f1 score of the Logistic Classifier is:  0.6263131725066086


#### iii. Average precision score

In [69]:
from sklearn.metrics import precision_score

print('The precision score of the Logistic Classifier is: ', precision_score(y_test_enc, y_preds_enc, average='weighted'))

The precision score of the Logistic Classifier is:  0.6658835338650161


#### iv. Average recall score

In [71]:
from sklearn.metrics import recall_score

print('The recall score of the Logistic Classifier is: ', recall_score(y_test_enc, y_preds_enc, average='weighted'))

The recall score of the Logistic Classifier is:  0.6523333333333333


# 10. Print true label and predicted label for any five examples 

In [80]:
# show the actual and the predicted label for the first 5 rows of the test set

y_preds_trans = label_enc.inverse_transform(y_preds_enc[:5])

pd.DataFrame({
    'Text': X_test.iloc[:5],
    'Actual Label': y_test.iloc[:5],
    'Predicted Label': y_preds_trans,
})

Unnamed: 0,Text,Actual Label,Predicted Label
6252,avoid mishaps best drivers stay phones driving...,Evidence,Evidence
4684,teens get cars cant stay phones likely ones ge...,Evidence,Evidence
1731,texting driving become number one driving dist...,Claim,Claim
4742,prefer phone driving time put life others life...,Evidence,Evidence
4521,use cell phones driving allowed death rates ke...,Position,Claim


In [None]:
# For the first 5 instances in the test set, 4 of the 5 predicted labels match the actual labels; the one miss is a Position that was predicted as Claim.