## Importing required packages 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

## Loading file 

In [3]:
# Read the annotated sentence CSV into a DataFrame.
# NOTE(review): `file_path` is not defined in any cell visible here — presumably it is
# set in an earlier cell; confirm it exists after Restart Kernel -> Run All.
df = pd.read_csv(file_path)

In [4]:
# Number of rows loaded from the CSV.
df.shape[0]

100

In [5]:
# Peek at the first five rows to see which annotation columns are available.
df.head(n=5)

Unnamed: 0,No,Sentences,Subject of comparison,Object of comparison,Nature,Categories,Feature matching,Symmetry,Salience,"Broad, unifying concept",Domain incongruence,Gold
0,1,Tom is as fast as a leopard.,Tom,a leopard,Qualitative,"Different basic level category (human, animal)",More prominent differences,Asymmetrical,High salience in object of comparison,Meaningful,Distinct,Simile
1,2,"But he’s no fool, even if he is as obstinate a...",He,a mule,Qualitative,"Different basic level category (human, animal)",More prominent differences,Asymmetrical,High salience in object of comparison,Meaningful,Distinct,Simile
2,3,I was as cool as a cucumber.,I,cucumber,Qualitative,"Different basic level category (human, fruit)",More prominent differences,Asymmetrical,High salience in object of comparison,Meaningful,Distinct,Simile
3,4,He paid as much as a million dollars for the p...,He,-,-,-,-,-,-,-,-,Not Applicable
4,5,The wicked flee when no one pursues; but the r...,Righteous,a lion,Qualitative,"Different basic level category (human, animal)",More prominent differences,Asymmetrical,High salience in object of comparison,Meaningful,Distinct,Simile


## Preparing the dataset

In [6]:
# Keep only the raw sentence text and its gold label.
df = df.loc[:, ["Sentences", "Gold"]]

In [7]:
# Display the frame to confirm only "Sentences" and "Gold" remain.
df

Unnamed: 0,Sentences,Gold
0,Tom is as fast as a leopard.,Simile
1,"But he’s no fool, even if he is as obstinate a...",Simile
2,I was as cool as a cucumber.,Simile
3,He paid as much as a million dollars for the p...,Not Applicable
4,The wicked flee when no one pursues; but the r...,Simile
...,...,...
95,He always keeps his room as neat as a pin.,Simile
96,He is as stubborn as a Missouri Mule.,Simile
97,The surface was as flat as a mirror.,Comparison
98,"It allows very poor people, who don't qualify ...",Not Applicable


In [8]:
# List the distinct gold labels present in the data.
print(pd.unique(df['Gold']))

['Simile' 'Not Applicable' 'Comparison']


## Converting into labels

In [9]:
# Work on an independent copy so the label encoding does not touch `df`.
new_df = df.copy(deep=True)

In [10]:
# Encode the gold classes as integer targets:
#   Simile -> 0, Comparison -> 1, Not Applicable -> 2
# One explicit mapping replaces three sequential .loc writes that put ints into a
# column of strings; the result is a proper integer column, and an unexpected or
# missing label raises here instead of staying un-encoded.
label_to_id = {'Simile': 0, 'Comparison': 1, 'Not Applicable': 2}

# .get(label, label) leaves already-encoded values untouched, so re-running this
# cell is harmless.
encoded_gold = new_df['Gold'].map(lambda label: label_to_id.get(label, label))

unexpected_labels = set(encoded_gold.unique()) - set(label_to_id.values())
if unexpected_labels:
    raise ValueError(
        f"Unexpected values in 'Gold': {sorted(unexpected_labels, key=str)}"
    )

new_df['Gold'] = encoded_gold.astype(int)

In [11]:
# Display the frame to check that "Gold" now holds the numeric class ids.
new_df

Unnamed: 0,Sentences,Gold
0,Tom is as fast as a leopard.,0
1,"But he’s no fool, even if he is as obstinate a...",0
2,I was as cool as a cucumber.,0
3,He paid as much as a million dollars for the p...,2
4,The wicked flee when no one pursues; but the r...,0
...,...,...
95,He always keeps his room as neat as a pin.,0
96,He is as stubborn as a Missouri Mule.,0
97,The surface was as flat as a mirror.,1
98,"It allows very poor people, who don't qualify ...",2


## Splitting into train and test 

In [12]:
# Hold out 20% of the sentences for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified — with only 100 rows and three classes the
# class mix of the test set may differ from the training set; consider `stratify=`.
X_train, X_test, y_train, y_test = train_test_split(
    new_df['Sentences'],
    new_df['Gold'],
    test_size=0.2,
    random_state=42,
)

Material on Understanding Tf-Idf - https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/

Material on Understanding BOW - https://www.geeksforgeeks.org/bag-of-word-and-frequency-count-in-text-using-sklearn/?ref=next_article_top

## Vectorization using Bag-Of-Words with word unigrams, bigrams and character unigrams and bigrams

In [13]:
# Bag-of-words extractors: one over word 1-2 grams, one over character 1-2 grams.
word_vectorizer = CountVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
)
char_vectorizer = CountVectorizer(
    analyzer='char',
    ngram_range=(1, 2),
)

## Fit and transform 

Source for more on fit(), transform() and fit_transform() - https://www.analyticsvidhya.com/blog/2021/04/difference-between-fit-transform-fit_transform-methods-in-scikit-learn-with-python-code/

In [19]:
# Fit the word n-gram vectorizer on the training sentences only (the test split is not used here).
word_vectorizer.fit(X_train)

CountVectorizer(ngram_range=(1, 2))

In [25]:
# Inspect the fitted mapping from each word n-gram to its integer index.
word_vectorizer.vocabulary_

{'heard': 644,
 'her': 656,
 'speaking': 1203,
 'english': 465,
 'as': 94,
 'fluently': 523,
 'an': 43,
 'american': 40,
 'heard her': 646,
 'her speaking': 660,
 'speaking english': 1204,
 'english as': 466,
 'as fluently': 131,
 'fluently as': 524,
 'as an': 98,
 'an american': 44,
 'he': 621,
 'is': 714,
 'nutty': 932,
 'fruitcake': 557,
 'he is': 628,
 'is as': 715,
 'as nutty': 172,
 'nutty as': 933,
 'as fruitcake': 137,
 'this': 1349,
 'child': 332,
 'gentle': 562,
 'lamb': 772,
 'today': 1380,
 'this child': 1350,
 'child is': 333,
 'as gentle': 138,
 'gentle as': 563,
 'as lamb': 152,
 'lamb today': 773,
 'frederick': 542,
 'looked': 824,
 'white': 1461,
 'sheet': 1159,
 'said': 1128,
 'one': 972,
 'old': 962,
 'woman': 1490,
 'and': 50,
 'the': 1269,
 'crowd': 395,
 'parted': 1013,
 'gentry': 564,
 'carriage': 322,
 'drove': 436,
 'out': 992,
 'of': 943,
 'yard': 1510,
 'frederick looked': 544,
 'looked as': 825,
 'as white': 219,
 'white as': 1462,
 'as sheet': 193,
 'sheet 

In [20]:
# Fit the character n-gram vectorizer on the training sentences only.
char_vectorizer.fit(X_train)

CountVectorizer(analyzer='char', ngram_range=(1, 2))

In [26]:
# Inspect the fitted mapping from each character n-gram to its integer index.
char_vectorizer.vocabulary_

{'i': 222,
 ' ': 0,
 'h': 209,
 'e': 150,
 'a': 76,
 'r': 363,
 'd': 132,
 's': 390,
 'p': 345,
 'k': 248,
 'n': 296,
 'g': 192,
 'l': 259,
 'f': 179,
 'u': 436,
 't': 415,
 'y': 473,
 'm': 280,
 'c': 115,
 '.': 59,
 'i ': 223,
 ' h': 11,
 'he': 214,
 'ea': 156,
 'ar': 91,
 'rd': 371,
 'd ': 133,
 'er': 170,
 'r ': 364,
 ' s': 22,
 'sp': 407,
 'pe': 351,
 'ak': 86,
 'ki': 252,
 'in': 235,
 'ng': 308,
 'g ': 193,
 ' e': 8,
 'en': 167,
 'gl': 202,
 'li': 268,
 'is': 239,
 'sh': 400,
 'h ': 210,
 ' a': 4,
 'as': 92,
 's ': 391,
 ' f': 9,
 'fl': 186,
 'lu': 276,
 'ue': 442,
 'nt': 315,
 'tl': 427,
 'ly': 279,
 'y ': 474,
 'an': 89,
 'n ': 297,
 'am': 88,
 'me': 288,
 'ri': 376,
 'ic': 227,
 'ca': 118,
 'n.': 301,
 'e ': 151,
 ' i': 12,
 ' n': 17,
 'nu': 316,
 'ut': 451,
 'tt': 432,
 'ty': 435,
 'a ': 77,
 'fr': 188,
 'ru': 386,
 'ui': 444,
 'it': 240,
 'tc': 423,
 'ke': 251,
 'e.': 155,
 'b': 99,
 'o': 321,
 'th': 425,
 'hi': 215,
 ' c': 6,
 'ch': 122,
 'il': 233,
 'ld': 265,
 ' g': 10,
 '

In [28]:
# Encode both splits with the vectorizers that were fitted on the training sentences.
# Each pair is (train, test).
X_train_word, X_test_word = (
    word_vectorizer.transform(split) for split in (X_train, X_test)
)
X_train_char, X_test_char = (
    char_vectorizer.transform(split) for split in (X_train, X_test)
)

In [29]:
# Cast the class ids in both target splits to an integer dtype before training.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

## SVM model training on word unigrams and bigrams

In [30]:
# Linear-kernel SVM trained on the word n-gram count features.
model = SVC(kernel='linear', C=1.0)
model.fit(X=X_train_word, y=y_train)

SVC(kernel='linear')

## Accuracy on test set 

In [31]:
# Predict class ids for the held-out sentences (word n-gram features).
predictions = model.predict(X=X_test_word)

In [32]:
# Fraction of test sentences whose predicted class matches the gold label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))

Accuracy: 0.75


## SVM model training on character unigrams and bigrams 

In [34]:
# Fresh linear-kernel SVM, this time trained on the character n-gram count features.
# Rebinding `model` discards the word-feature model trained above.
model = SVC(kernel='linear', C=1.0)
model.fit(X=X_train_char, y=y_train)

SVC(kernel='linear')

## Accuracy on test set 

In [35]:
# Predict class ids for the held-out sentences (character n-gram features).
predictions = model.predict(X=X_test_char)

In [36]:
# Test-set accuracy of the character n-gram model.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))

Accuracy: 0.55


## Vectorization using Tf-Idf with word unigrams and bigrams

In [39]:
# Tf-Idf weighted extractor over word 1-2 grams.
# Rebinding `word_vectorizer` replaces the CountVectorizer used earlier.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
)

## Fit and transform 

In [40]:
# Fit the Tf-Idf vectorizer on the training sentences only.
word_vectorizer.fit(X_train)

TfidfVectorizer(ngram_range=(1, 2))

In [234]:
# Encode both splits with the Tf-Idf vectorizer fitted on the training sentences.
# The pair is (train, test); this overwrites the earlier count-based word features.
X_train_word, X_test_word = (
    word_vectorizer.transform(split) for split in (X_train, X_test)
)

In [235]:
# Cast the target splits to an integer dtype.
# NOTE(review): the same cast already appears in an earlier cell, so on a top-to-bottom
# run this is presumably a no-op — confirm before removing.
y_train, y_test = y_train.astype('int'), y_test.astype('int')

## SVM model training on Tf-Idf word unigrams and bigrams

In [236]:
# Fresh linear-kernel SVM trained on the Tf-Idf word n-gram features.
model = SVC(kernel='linear', C=1.0)
model.fit(X=X_train_word, y=y_train)

SVC(kernel='linear')

## Accuracy on test set 

In [237]:
# Predict class ids for the held-out sentences (Tf-Idf word n-gram features).
predictions = model.predict(X=X_test_word)

In [238]:
# Test-set accuracy of the Tf-Idf model.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))

Accuracy: 0.75
