<a href="https://colab.research.google.com/github/shravya0108/Diabetes-Prediction-Data-Science-Project-/blob/main/Counterfactual_Text_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 Recognizing Counterfactual Statements (RCS) -- Determine whether a given sentence is counterfactual or not.


1 Mounting Google Drive to access the data set


In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive
# (such as the dataset CSV read later from /content/drive/MyDrive) are
# reachable through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


2 Importing all the necessary libraries

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score


In [None]:
import nltk
# Fetch the NLTK resources this notebook relies on:
#   - tokenizer models for word_tokenize,
#   - WordNet for WordNetLemmatizer,
#   - the perceptron POS tagger model for pos_tag,
#   - the stop-word lists.
# Newer NLTK releases look the tokenizer and tagger up under the names
# 'punkt_tab' and 'averaged_perceptron_tagger_eng'; the legacy names are kept
# as well so that older releases keep working.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

3 Setting the NumPy random seed (500) for reproducibility

In [None]:
# Seed NumPy's global random number generator with a fixed value so that
# later operations drawing from it give the same result on every fresh run.
np.random.seed(500)

4 Reading csv file

In [None]:
# Location of the dataset CSV on the mounted Google Drive.
path = '/content/drive/MyDrive/dataset.csv'

# Load the dataset into a DataFrame. Note that `corpus` is this DataFrame,
# not the nltk.corpus module: the former `from nltk import corpus` line was
# immediately overwritten by this assignment and has been dropped.
corpus = pd.read_csv(path, encoding='utf-8')

# Fail early, with a clear message, if the columns used by the rest of the
# notebook are not present.
assert {'sentence', 'gold_label'} <= set(corpus.columns), \
    "dataset.csv must contain 'sentence' and 'gold_label' columns"

5 Printing the path of csv file

In [None]:
# Report which CSV file the data set was read from.
print("File: %s" % (path,))

File: /content/drive/MyDrive/dataset.csv


In [None]:
percent = 0.3  # fraction of the rows held out as the test set (passed as test_size to train_test_split)




6 Data cleaning by dropping null values and converting text to lowercase

In [None]:
# Remove rows whose 'sentence' is missing. The filter has to be applied to the
# DataFrame and the result re-assigned: calling dropna(inplace=True) on the
# selected column does not remove any rows from the DataFrame, so a missing
# value would still reach the lowercasing step and raise.
# The index is reset so that row labels stay 0..n-1 after rows are dropped.
corpus = corpus.dropna(subset=['sentence']).reset_index(drop=True)

# Lowercase every sentence (values are coerced to str first so that a stray
# non-string entry cannot break the string method).
corpus['sentence'] = corpus['sentence'].astype(str).str.lower()

In [None]:
# Show the 'sentence' column to check the result of the lowercasing step.
print(corpus['sentence'])

0       unfortunately, letting them behave badly -- be...
1       if pepfar ended antiretroviral coverage to a t...
2       republicans may not have read their bill befor...
3       people close to mr. trump have suggested that ...
4       this must all happen fast if policies are to t...
                              ...                        
6995    recruiters said that deutsche's approach was l...
6996    if they include agriculture, said ms malmstrm ...
6997    alcoa may have put to rest fears that earnings...
6998    if the new rules are approved by the eu, lives...
6999    if they win and the affordable care act, or pi...
Name: sentence, Length: 7000, dtype: object


7 Tokenizing each sentence into words

In [None]:
# Replace each sentence string with the list of tokens produced by
# word_tokenize.
corpus['sentence'] = corpus['sentence'].apply(word_tokenize)

In [None]:
# Display the DataFrame; after tokenization the 'sentence' column holds
# lists of tokens instead of raw strings.
print(corpus)

      sentenceID  gold_label  \
0         113000           0   
1         113001           0   
2         113002           0   
3         113003           0   
4         113004           0   
...          ...         ...   
6995      119995           0   
6996      119996           0   
6997      119997           0   
6998      119998           0   
6999      119999           0   

                                               sentence  
0     [unfortunately, ,, letting, them, behave, badl...  
1     [if, pepfar, ended, antiretroviral, coverage, ...  
2     [republicans, may, not, have, read, their, bil...  
3     [people, close, to, mr., trump, have, suggeste...  
4     [this, must, all, happen, fast, if, policies, ...  
...                                                 ...  
6995  [recruiters, said, that, deutsche, 's, approac...  
6996  [if, they, include, agriculture, ,, said, ms, ...  
6997  [alcoa, may, have, put, to, rest, fears, that,...  
6998  [if, the, new, rules, are, ap

In [None]:
# Number of rows in the data set.
print(corpus.shape[0])

7000


In [None]:
# Inspect the tokenized 'sentence' column on its own.
print(corpus['sentence'])

0       [unfortunately, ,, letting, them, behave, badl...
1       [if, pepfar, ended, antiretroviral, coverage, ...
2       [republicans, may, not, have, read, their, bil...
3       [people, close, to, mr., trump, have, suggeste...
4       [this, must, all, happen, fast, if, policies, ...
                              ...                        
6995    [recruiters, said, that, deutsche, 's, approac...
6996    [if, they, include, agriculture, ,, said, ms, ...
6997    [alcoa, may, have, put, to, rest, fears, that,...
6998    [if, the, new, rules, are, approved, by, the, ...
6999    [if, they, win, and, the, affordable, care, ac...
Name: sentence, Length: 7000, dtype: object


8 Lemmatizing words and removing stop words, then splitting the data, extracting TF-IDF features, and training and evaluating an SVM classifier

In [None]:
# ---------------------------------------------------------------------------
# Lemmatization and stop-word removal
# ---------------------------------------------------------------------------
# Lookup from the first letter of a POS tag (as returned by pos_tag) to the
# WordNet part-of-speech constant the lemmatizer expects. Any letter other
# than J / V / R falls back to noun via the defaultdict factory.
tag_maps = defaultdict(lambda: wn.NOUN)
tag_maps['J'] = wn.ADJ
tag_maps['V'] = wn.VERB
tag_maps['R'] = wn.ADV

# Built once instead of once per token / per row: a set gives constant-time
# membership tests, and a single lemmatizer instance is reused for all rows.
stop_words = set(stopwords.words('english'))
word_Lemmatized = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``tokens``.

    Parameters
    ----------
    tokens : list of str
        Tokens of one sentence (one entry of ``corpus['sentence']``).

    Returns
    -------
    list of str
        One lemma per kept token, in the original order. Tokens that are
        English stop words or contain non-alphabetic characters are dropped.
    """
    return [
        word_Lemmatized.lemmatize(word, tag_maps[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in stop_words and word.isalpha()
    ]


# The column stores the string form of each lemma list (as before) so that
# TfidfVectorizer can consume it directly. Assigning the whole column at once
# is positional, so it does not depend on the DataFrame index being 0..n-1.
corpus['sentence_final'] = [str(lemmatize_tokens(entry)) for entry in corpus['sentence']]

# ---------------------------------------------------------------------------
# Train / test split
# ---------------------------------------------------------------------------
# random_state pins the split so that re-running this cell on its own yields
# the same partition (500 is the value also used for the global NumPy seed).
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    corpus['sentence_final'], corpus['gold_label'],
    test_size=percent, random_state=500)

# Show the processed form of the last sentence as a quick sanity check.
print(corpus['sentence_final'].iloc[-1])

# ---------------------------------------------------------------------------
# Feature generation
# ---------------------------------------------------------------------------
print(">> Feature generation...")
Encoder = LabelEncoder()
Train_Y = Encoder.fit_transform(Train_Y)
# Reuse the mapping learned on the training labels; re-fitting on the test
# labels could assign different integer codes to the same class.
Test_Y = Encoder.transform(Test_Y)
print(Train_X)
print(Train_Y)

# The vocabulary and IDF weights are learned from the training split only,
# so that no information from the test sentences leaks into the features.
Tfidf_vect = TfidfVectorizer(max_features=5000)
Train_X_Tfidf = Tfidf_vect.fit_transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)
# Print the matrix shapes rather than every non-zero entry of the sparse
# matrices, which floods the cell output.
print("Train TF-IDF matrix shape:", Train_X_Tfidf.shape)
print("Test TF-IDF matrix shape:", Test_X_Tfidf.shape)

# ---------------------------------------------------------------------------
# SVM classifier
# ---------------------------------------------------------------------------
print(">> SVM classifier....")
# Linear-kernel SVM. The former degree/gamma arguments were dropped because
# they only apply to non-linear kernels and had no effect here.
SVM = svm.SVC(C=1.0, kernel='linear')
SVM.fit(Train_X_Tfidf, Train_Y)
predictions_SVM = SVM.predict(Test_X_Tfidf)
print("SVM Accuracy Score -> ", accuracy_score(Test_Y, predictions_SVM) * 100)
print("SVM Precision Score -> ", precision_score(Test_Y, predictions_SVM) * 100)
print("SVM Recall Score -> ", recall_score(Test_Y, predictions_SVM) * 100)
print("SVM F1 Score -> ", f1_score(Test_Y, predictions_SVM) * 100)


['win', 'affordable', 'care', 'act', 'piece', 'fall', 'estimate', 'million', 'american', 'lose', 'coverage']
>> Feature generation...
4161    ['expect', 'inflation', 'go', 'back', 'normal'...
3973    ['even', 'though', 'bill', 'perfect', 'first',...
6101                                 ['proactive', 'say']
4601    ['lever', 'play', 'run', 'constrain', 'say', '...
783     ['one', 'slogan', 'think', 'clinch', 'last', '...
                              ...                        
6935    ['matter', 'many', 'veteran', 'may', 'receive'...
5113    ['one', 'day', 'prosecutor', 'charge', 'assang...
2022    ['biden', 'byron', 'suggest', 'also', 'come', ...
1996    ['additional', 'resource', 'bring', 'bear', 's...
6598    ['still', 'case', 'zadvydas', 'davis', 'majori...
Name: sentence_final, Length: 4900, dtype: object
[0 0 1 ... 0 0 0]
  (0, 4973)	0.10325022106987664
  (0, 3042)	0.32532283686373625
  (0, 2605)	0.2531248468306388
  (0, 2301)	0.568118526295317
  (0, 1976)	0.18571784695593793
  (

In [None]:
# Display the POS-tag lookup table. tag_maps is a defaultdict, so every tag
# initial that was looked up during lemmatization has been inserted with the
# default value (wn.NOUN); that is why the output lists more keys than the
# three that were set explicitly.
print(tag_maps)

defaultdict(<function <lambda> at 0x7fe2e6904b00>, {'J': 'a', 'V': 'v', 'R': 'r', 'N': 'n', 'W': 'n', 'M': 'n', 'I': 'n', 'F': 'n', 'C': 'n', 'P': 'n', 'D': 'n', 'E': 'n', 'S': 'n', '$': 'n', 'U': 'n', 'T': 'n', "'": 'n'})
