In [100]:
# Importing the Libraries
!pip install --quiet googletrans

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
import time
from string import punctuation
from nltk.util import ngrams
from nltk import word_tokenize
from collections import Counter
from spacy import displacy
from tqdm import tqdm
import warnings
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import RegexpTokenizer

In [101]:
# Patterns are compiled once at definition time instead of on every call.
_HTML_TAG_RE = re.compile(r'<.*?>')
_URL_RE = re.compile(r'http\S+')
_WORD_RE = re.compile(r'\w+')


def preprocess_text(text, ptype='english'):
    """Normalise one premise/hypothesis string.

    Steps, in order:
      1. Replace the ``##`` utterance separator with a space.
      2. Lower-case the text.
      3. Remove ``<...>`` tags and anything starting with ``http`` up to the
         next whitespace.
      4. Replace ``:`` with a space.
      5. Unless ``ptype == 'hindi'``, keep only ``\\w+`` tokens and join them
         with single spaces (this drops punctuation).

    URLs are removed *before* colons are replaced; doing it the other way
    round turns ``http://host`` into ``http //host`` so only the literal
    ``http`` would be removed and the rest of the URL would survive as tokens.

    Parameters
    ----------
    text : str
        Raw text. ``None`` and float NaN (how pandas represents an empty CSV
        cell) yield an empty string; any other non-string is passed through
        ``str()`` first.
    ptype : str, default 'english'
        Pass ``'hindi'`` to skip step 5; every other value applies it.

    Returns
    -------
    str
        The cleaned text.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    result = str(text).replace("##", ' ').lower()
    result = _HTML_TAG_RE.sub('', result)
    result = _URL_RE.sub('', result)
    result = result.replace(":", ' ')

    if ptype != 'hindi':
        # Same tokens as nltk's RegexpTokenizer(r'\w+').tokenize(result),
        # which is a findall over the pattern.
        result = " ".join(_WORD_RE.findall(result))

    return result

def label_encode(text):
    """Map a label to its integer class.

    ``'entailment'`` becomes ``0``; every other string becomes ``1``.

    A value that is already ``0`` or ``1`` is returned unchanged, so applying
    the encoder to an already-encoded column does not turn every label into
    ``1``. Any other non-string value maps to ``1``.

    Parameters
    ----------
    text : str or int
        Raw label string, or an already-encoded 0/1 value.

    Returns
    -------
    int
        ``0`` or ``1``.
    """
    if isinstance(text, str):
        return 0 if text == 'entailment' else 1
    # Idempotence guard: keep previously encoded labels as they are.
    if text in (0, 1):
        return int(text)
    return 1

In [102]:
# Read the `english_*` train and test CSVs from the local data directory.
english_train_raw_df, english_test_raw_df = map(
    pd.read_csv,
    ("data/english_train.csv", "data/english_test.csv"),
)

In [103]:
# Read the `translated_*` train and test CSVs from the local data directory.
hindi_train_raw_df, hindi_test_raw_df = map(
    pd.read_csv,
    ("data/translated_train.csv", "data/translated_test.csv"),
)

In [104]:
# Append the two `english_*` text columns to the translated frames.
# A column-wise concat aligns rows on the index.
df_train = pd.concat(
    (
        hindi_train_raw_df,
        english_train_raw_df.loc[:, ['english_premise', 'english_hypothesis']],
    ),
    axis='columns',
)
df_test = pd.concat(
    (
        hindi_test_raw_df,
        english_test_raw_df.loc[:, ['english_premise', 'english_hypothesis']],
    ),
    axis='columns',
)

In [105]:
# Rename the `translated_*` columns to `hindi_*`; errors="raise" makes a
# missing source column fail loudly instead of being silently skipped.
df_train = df_train.rename(
    mapper={"translated_premise": "hindi_premise", "translated_hypothesis": "hindi_hypothesis"},
    axis="columns",
    errors="raise",
)
df_test = df_test.rename(
    mapper={"translated_premise": "hindi_premise", "translated_hypothesis": "hindi_hypothesis"},
    axis="columns",
    errors="raise",
)

In [106]:
# Remove the 'Unnamed: 0' column.
# Reassigning (rather than inplace=True) and tolerating an already-missing
# column keeps this cell safe to re-run: the in-place version raises KeyError
# the second time it is executed.
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')

In [107]:
# Preview the first rows of the merged test frame.
df_test.head()

Unnamed: 0,premise,hypothesis,labels,hindi_premise,hindi_hypothesis,english_premise,english_hypothesis
0,RANI : Woh Chaddha uncle keh rahein the ki mai...,RANI ka job karna mummy ko theek nahi lagega,entailment,रानी: वो चड्ढा अंकल कह रहे हैं की मैं उनके वहा...,रानी का जॉब करना मम्मी को ठीक नहीं लगेगा,Rani: That Chaddha uncle is saying that I shou...,Mummy will not like to do queen's job
1,BAUJI : Arre yeh chutney chakho .. ## MOTHER :...,Mom ro rahi hai,entailment,बाउजी : अरे ये चटनी चाखो.. ## मां: नहीं नहीं ह...,माँ रो रही है,"Bauji: Hey taste this chutney.. ## Mother: No,...",mom is crying
2,"SARA : Mei n t oh rah oon gi na hin , l eki n ...",DEEPAK ko bhi aisa feel ho raha hai ki ek aur ...,entailment,"सारा : मेई एन टी ओह रहूं गी न हिन, ल एक न तुम्...",दीपक को भी ऐसा महसूस होता है कि एक और नामांकित...,"Sara: Mei nt oh rahoon gi na hin, it would hav...",Deepak too feels that it would have been bette...
3,SHAHID : Nahin kiya . ## RAJESH : Ruthe ho ? #...,RAJESH ne SHAHID se kaha ki tum Prachand murkh...,entailment,शाहिद: ​​नहीं किया। ## राजेश : रूठे हो? ## शाह...,राजेश ने शाहिद से कहा की तुम प्रचंड मुड़ नहीं हो।,Shahid: Didn't. ## Rajesh: Are you angry? ## S...,Rajesh told Shahid that you are not very twisted.
4,KHALUJAN : Babban ..? ## BABBAN : Haan ! Poore...,KHALUJAN ne kidnapping karwaai hai,entailment,खलुजान : बब्बन..? ## बब्बन : हां ! पूरे प्लान ...,खालजान ने अपहरण करवाई है,Khalujan: Babban..? ## Babban: Yes! You are bu...,Khaljan has kidnapped


In [108]:
# Stack the test rows on top of the train rows and renumber the index from 0.
combined_df = pd.concat((df_test, df_train), axis='index', ignore_index=True)

In [109]:
# Clean every text column with preprocess_text. The two `hindi_*` columns use
# the 'hindi' mode; all other columns use the default 'english' mode.
for column, ptype in (
    ('premise', 'english'),
    ('hypothesis', 'english'),
    ('hindi_premise', 'hindi'),
    ('hindi_hypothesis', 'hindi'),
    ('english_premise', 'english'),
    ('english_hypothesis', 'english'),
):
    combined_df[column] = combined_df[column].apply(preprocess_text, args=(ptype,))

In [110]:
# Convert each label to its integer class via label_encode.
combined_df['labels'] = combined_df['labels'].map(label_encode)

In [111]:
# Re-split the combined frame 80/20; the fixed random_state makes the split
# repeatable for the same input frame.
df_train, df_test = train_test_split(combined_df, test_size=0.2, random_state=0)

In [112]:
# (rows, columns) of the training split.
df_train.shape

(1791, 7)

In [113]:
# (rows, columns) of the test split.
df_test.shape

(448, 7)

In [95]:
# Preview the first rows of combined_df.
# NOTE(review): this cell's execution count (95) is lower than the cells above
# it, so the stored output may come from an earlier run — re-execute to confirm.
combined_df.head()

Unnamed: 0,premise,hypothesis,labels,hindi_premise,hindi_hypothesis,english_premise,english_hypothesis
0,rani woh chaddha uncle keh rahein the ki main ...,rani ka job karna mummy ko theek nahi lagega,0,रानी वो चड्ढा अंकल कह रहे हैं की मैं उनके वहा...,रानी का जॉब करना मम्मी को ठीक नहीं लगेगा,rani that chaddha uncle is saying that i shoul...,mummy will not like to do queen s job
1,bauji arre yeh chutney chakho mother nahi nahi...,mom ro rahi hai,0,बाउजी अरे ये चटनी चाखो.. मां नहीं नहीं हम...,माँ रो रही है,bauji hey taste this chutney mother no we don ...,mom is crying
2,sara mei n t oh rah oon gi na hin l eki n tumh...,deepak ko bhi aisa feel ho raha hai ki ek aur ...,0,"सारा मेई एन टी ओह रहूं गी न हिन, ल एक न तुम्...",दीपक को भी ऐसा महसूस होता है कि एक और नामांकित...,sara mei nt oh rahoon gi na hin it would have ...,deepak too feels that it would have been bette...
3,shahid nahin kiya rajesh ruthe ho shahid nahin...,rajesh ne shahid se kaha ki tum prachand murkh...,0,शाहिद ​​नहीं किया। राजेश रूठे हो? शाहिद...,राजेश ने शाहिद से कहा की तुम प्रचंड मुड़ नहीं हो।,shahid didn t rajesh are you angry shahid no r...,rajesh told shahid that you are not very twisted
4,khalujan babban babban haan poore plan ki ammi...,khalujan ne kidnapping karwaai hai,0,खलुजान बब्बन..? बब्बन हां ! पूरे प्लान क...,खालजान ने अपहरण करवाई है,khalujan babban babban yes you are busy doing ...,khaljan has kidnapped


In [114]:
# Persist both splits as CSV. The index is written as the first column because
# `index` is left at its default.
df_train.to_csv(path_or_buf='data/train.csv')
df_test.to_csv(path_or_buf='data/test.csv')

In [19]:
# Overwrite `premise` / `hypothesis` with the `english_*` columns.
# NOTE(review): execution counts restart here (In [19] after In [114]), so this
# cell appears to belong to a different run — confirm which `combined_df` it is
# meant to operate on before relying on it.
combined_df['premise'] = combined_df['english_premise']
combined_df['hypothesis'] = combined_df['english_hypothesis']

In [20]:
# Shuffle all rows and renumber the index from 0.
# A fixed random_state makes the shuffle repeatable; without it the seeded
# train/test split that follows would still pick different rows on every run.
combined_df = combined_df.sample(frac=1, random_state=0).reset_index(drop=True)

In [21]:
# Convert each label to its integer class via label_encode.
# NOTE(review): this is the second cell that encodes `labels`; it expects the
# column to hold raw label strings — verify the frame was not already encoded
# by an earlier cell in the same kernel session.
combined_df['labels'] = combined_df['labels'].apply(label_encode)

In [22]:
# Preview the first rows of the shuffled, label-encoded frame.
combined_df.head()

Unnamed: 0.1,Unnamed: 0,premise,hypothesis,labels,english_premise,english_hypothesis
0,1253,Mr. Kapoor: What is this? ## Mr. Kapoor: Swimm...,Mr. Kapoor's company is doing research on micr...,1,Mr. Kapoor: What is this? ## Mr. Kapoor: Swimm...,Mr. Kapoor's company is doing research on micr...
1,1386,"Mother: Arjun, I felt that you were not seriou...",Mother said that Arjun wanted to do something ...,1,"Mother: Arjun, I felt that you were not seriou...",Mother said that Arjun wanted to do something ...
2,1679,Umar : Wait… why am I running… do I also belly...,OMAR is from England,0,Umar : Wait… why am I running… do I also belly...,OMAR is from England
3,1756,"Shahid: Sorry, I'm late again. ## Mary: Heard ...",Shahid is serving food,1,"Shahid: Sorry, I'm late again. ## Mary: Heard ...",Shahid is serving food
4,1518,Ishaan: Won.. Yeh lo saloon.. End aah!! INDUST...,Ishaan misses Baka a lot.,0,Ishaan: Won.. Yeh lo saloon.. End aah!! INDUST...,Ishaan misses Baka a lot.


In [23]:
# 80/20 split of combined_df with a fixed random_state.
df_train, df_test = train_test_split(combined_df, test_size=0.2, random_state=0)

In [24]:
# (rows, columns) of the training rows whose encoded label is 1.
df_train.loc[df_train['labels'] == 1].shape

(739, 6)

In [25]:
# Remove the 'Unnamed: 0' column from both splits.
# The frames come from train_test_split, so dropping in place can trigger
# pandas' SettingWithCopyWarning and raises KeyError when the cell is re-run;
# reassigning with errors='ignore' avoids both.
df_train = df_train.drop(columns=['Unnamed: 0'], errors='ignore')
df_test = df_test.drop(columns=['Unnamed: 0'], errors='ignore')

In [26]:
# Write both splits to the working directory; the index is included as the
# first column because `index` is left at its default.
df_train.to_csv(path_or_buf='combined_english_train.csv')
df_test.to_csv(path_or_buf='combined_english_test.csv')

In [99]:
# train_raw_df['premise'] = train_raw_df['premise'].apply(preprocess_text)
# test_raw_df['premise'] = test_raw_df['premise'].apply(preprocess_text)
# train_raw_df['hypothesis'] = train_raw_df['hypothesis'].apply(lambda x: preprocess_text(x,'hypothesis'))
# test_raw_df['hypothesis'] = test_raw_df['hypothesis'].apply(lambda x: preprocess_text(x,'hypothesis'))

In [100]:
# Drop the 'Unnamed: 0' column from both raw frames, in place.
# NOTE(review): `train_raw_df` / `test_raw_df` are not defined in any cell
# visible here — confirm where they are created; on a fresh kernel this cell
# would presumably fail with NameError.
train_raw_df.drop(columns=['Unnamed: 0'],inplace=True)
test_raw_df.drop(columns=['Unnamed: 0'],inplace=True)

In [102]:
# Write the raw frames to the working directory (index included by default).
# NOTE(review): `train_raw_df` / `test_raw_df` are not defined in any cell
# visible here — confirm where they are created.
train_raw_df.to_csv(path_or_buf='train.csv')
test_raw_df.to_csv(path_or_buf='test.csv')

In [67]:
# Load 'combined_hinglish_train.csv' from the working directory.
# NOTE(review): no visible cell writes this file — confirm how it is produced.
df = pd.read_csv('combined_hinglish_train.csv')