# Importing Modules

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw train/test CSVs from the sibling `input` directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Grab the question pair stored under row label 3 as a quick sanity sample.
que1 = train.at[3, 'question1']
que2 = train.at[3, 'question2']

In [None]:
# Show the two sampled questions side by side.
print(que1,que2)

In [None]:
# Class balance of the target: number of rows per `is_duplicate` label (0 / 1).
grouped = train.groupby('is_duplicate').is_duplicate.count()
total = grouped[0] + grouped[1]
# Kept as fractions of the total; they are scaled to percentages only when
# printed, so the message's "percent" wording matches the numbers shown.
sim_per = grouped[1] / total
no_sim_per = grouped[0] / total
print(("The similar data is {:.2f} percent and {:.2f} percent are not similar").format(sim_per * 100, no_sim_per * 100))

# Starting preprocessing

In [None]:
import string
import re    
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag
from nltk.corpus import wordnet as wn

In [None]:
# NLTK's English stop words, held in a set for the membership test in Remove_Noise.tokenize.
stoplist = set(stopwords.words("english"))
%matplotlib inline

In [None]:
# Ordered (regex, replacement) pairs that expand English contractions.
#
# * Irregular forms come first so the generic "n't" rule cannot mangle them
#   (without that, "won't" would become "wo not").
# * Every pattern carries an inline (?i) flag so capitalised forms such as
#   "Won't", "Can't" and "I'm" are expanded as well.
# * The irregular forms are anchored with \b so they only match whole words.
# * Replacements are raw strings: "\g<1>" is a regex group reference, not a
#   Python string escape.
replacement_patterns = [
    (r"(?i)\bwon't\b", 'will not'),
    (r"(?i)\bcan't\b", 'cannot'),
    (r"(?i)\bi'm\b", 'i am'),
    (r"(?i)\bain't\b", 'is not'),
    (r"(?i)(\w+)'ll", r'\g<1> will'),
    (r"(?i)(\w+)n't", r'\g<1> not'),
    (r"(?i)(\w+)'ve", r'\g<1> have'),
    (r"(?i)(\w+)'s", r'\g<1> is'),
    (r"(?i)(\w+)'re", r'\g<1> are'),
    (r"(?i)(\w+)'d", r'\g<1> would'),
]


class RegexpReplacer(object):
    """Apply a list of regex substitutions to a string, in order.

    Parameters
    ----------
    patterns : iterable of (str, str)
        ``(regex, replacement)`` pairs. Each regex is compiled once at
        construction time. Defaults to ``replacement_patterns``.
    """

    def __init__(self, patterns=replacement_patterns):
        self.patterns = [(re.compile(regex), repl) for (regex, repl) in patterns]

    def replace(self, text):
        """Return ``text`` with every pattern substituted, first to last."""
        s = text
        for (pattern, repl) in self.patterns:
            # The patterns are already compiled, so call .sub on them directly.
            s = pattern.sub(repl, s)
        return s

In [None]:
class Remove_Noise(object):
    """Text-cleaning helpers: punctuation/digit stripping, stop-word removal
    and part-of-speech-aware WordNet lemmatization.

    Parameters
    ----------
    stop_word : collection of str
        Tokens to drop in ``tokenize``. Defaults to the module-level
        ``stoplist``.
    """

    def __init__(self,stop_word = stoplist):
        # Store the caller-supplied collection. Previously the argument was
        # ignored and the global ``stoplist`` was always used.
        self.stop_word = stop_word
        # Build the lemmatizer once per instance rather than once per token.
        self._lemmatizer = WordNetLemmatizer()

    def noise_rm(self,doc):
        """Return ``doc`` lower-cased with punctuation and digits blanked out.

        Steps, in order: replace each listed punctuation character with a
        space, replace each run of digits with a space, delete newlines,
        collapse runs of spaces into one, then lower-case.
        """
        doc = re.sub('[#$%^&\',:()*+/<=>@[\\]^_``{|}~]',' ',doc)
        doc = re.sub('[0-9]+',' ',doc)
        doc = re.sub('\n','',doc)
        doc = re.sub(' +',' ',doc)
        doc = doc.lower()
        return doc

    def lemmatize(self,token, tag):
        """Lemmatize ``token`` using the WordNet POS derived from ``tag``.

        Only the first character of ``tag`` is used (N/V/R/J); anything else,
        including an empty tag, falls back to noun.
        """
        # tag[:1] instead of tag[0] so an empty tag falls back to NOUN
        # rather than raising IndexError.
        wordnet_pos = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[:1], wn.NOUN)
        return self._lemmatizer.lemmatize(token, wordnet_pos)

    def tokenize(self,document):
        """Split ``document`` into lemmatized tokens with stop words removed.

        The text is split into sentences, each sentence into tokens, and the
        tokens are POS-tagged so the lemmatizer receives the right word class.
        """
        lemmy = []
        for sent in sent_tokenize(document):
            for token, tag in pos_tag(wordpunct_tokenize(sent)):
                # NOTE(review): this membership test is case-sensitive, so a
                # capitalised token such as "What" is kept unless the input was
                # lower-cased first -- confirm whether that is intended.
                if token in self.stop_word:
                    continue
                lemmy.append(self.lemmatize(token, tag))
        return lemmy

    def join_tokens(self,data):
        """Join an iterable of string tokens into one space-separated string."""
        return ' '.join(data)

In [None]:
# Shared helper instances used by `do_all`: built once, both with their default arguments.
replacer = RegexpReplacer()
remover = Remove_Noise()

In [None]:
def do_all(data):
    """Run the full cleaning pipeline on one piece of text.

    The text is passed through ``replacer.replace``, then
    ``remover.tokenize``, and the resulting tokens are joined back into a
    single string with ``remover.join_tokens``.
    """
    expanded = replacer.replace(data)
    return remover.join_tokens(remover.tokenize(expanded))
    

In [None]:
# Replace missing questions with a single space so the text pipeline always
# receives a string. The result is assigned back to the column: calling
# `fillna(..., inplace=True)` on `frame[col]` acts on a temporary Series, which
# triggers a chained-assignment warning on recent pandas and leaves the frame
# untouched when Copy-on-Write is enabled.
for frame in (train, test):
    for column in ('question1', 'question2'):
        frame[column] = frame[column].fillna(' ')

In [None]:
# Build the cleaned-text columns by running every raw question through
# `do_all`: training frame first, then the test frame.
for frame in (train, test):
    for raw_col, clean_col in (('question1', 'comment_full1'),
                               ('question2', 'comment_full2')):
        frame[clean_col] = frame[raw_col].apply(do_all)

In [None]:
# Write the training frame to disk -- presumably so the preprocessing above
# does not have to be repeated; confirm against downstream notebooks.
train.to_pickle('train_processed.pkl')
# The test frame is pickled separately in the next cell.

In [None]:
# Write the test frame to disk alongside the training pickle.
test.to_pickle('test_processed.pkl')