In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import string
import csv
import operator
import string
from bs4 import BeautifulSoup
from collections import defaultdict
from wordcloud import WordCloud, STOPWORDS
from nltk.tokenize import word_tokenize
from subprocess import check_output
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:
#Reading input files
# One DataFrame per topic, keyed by topic name; each is read from ../input/<topic>.csv
dataframes = {
    topic: pd.read_csv("../input/" + topic + ".csv")
    for topic in ("cooking", "crypto", "robotics", "biology", "travel", "diy")
}

In [3]:
#Preview of the cooking file as loaded (first 5 rows)
dataframes["cooking"].head(n=5)

Unnamed: 0,id,title,content,tags
0,1,How can I get chewy chocolate chip cookies?,<p>My chocolate chips cookies are always too c...,baking cookies texture
1,2,How should I cook bacon in an oven?,<p>I've heard of people cooking bacon in an ov...,oven cooking-time bacon
2,3,What is the difference between white and brown...,"<p>I always use brown extra large eggs, but I ...",eggs
3,4,What is the difference between baking soda and...,<p>And can I use one in place of the other in ...,substitutions please-remove-this-tag baking-so...
4,5,"In a tomato sauce recipe, how can I cut the ac...",<p>It seems that every time I make a tomato sa...,sauce pasta tomatoes italian-cuisine


In [4]:
#Function for Cleaning the data

# Pattern matching URIs (http(s)://..., www..., or domain.tld/...) including balanced parentheses
uri_re = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

def stripTagsAndUris(x):
    """Return the plain text of an HTML fragment with code blocks and URIs removed.

    Parameters
    ----------
    x : str
        HTML markup of one question body. Empty or non-text values
        (e.g. the float NaN pandas uses for a missing cell) are accepted.

    Returns
    -------
    str
        Text content of ``x`` with every ``<code>`` element dropped and
        every match of ``uri_re`` deleted; ``""`` for empty/non-text input.
    """
    # A missing cell arrives as float NaN, which is truthy; only parse real text.
    if not isinstance(x, (str, bytes)) or not x:
        return ""
    # BeautifulSoup on content
    soup = BeautifulSoup(x, "html.parser")
    # Stripping ALL <code> tags with their content: soup.code is only the
    # first match, so keep removing until none is left.
    while soup.code:
        soup.code.decompose()
    # Get all the text out of the html
    text = soup.get_text()
    # Returning text stripping out all uris
    return re.sub(uri_re, "", text)

In [5]:
# The content field of every input file holds HTML; run the cleaning function over that column
for frame in dataframes.values():
    frame["content"] = frame["content"].map(stripTagsAndUris)

In [10]:
#Preview of the cooking file after cleaning (first 5 rows)
dataframes["cooking"].head(n=5)

Unnamed: 0,id,title,content,tags
0,1,how can i get chewy chocolate chip cookies,my chocolate chips cookies are always too cris...,baking cookies texture
1,2,how should i cook bacon in an oven,i ve heard of people cooking bacon in an oven ...,oven cooking-time bacon
2,3,what is the difference between white and brown...,i always use brown extra large eggs but i can...,eggs
3,4,what is the difference between baking soda and...,and can i use one in place of the other in cer...,substitutions please-remove-this-tag baking-so...
4,5,in a tomato sauce recipe how can i cut the ac...,it seems that every time i make a tomato sauce...,sauce pasta tomatoes italian-cuisine


In [8]:
#Removing punctuation marks from cleaned input file

# Any character outside the 7-bit ASCII range
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
# string.punctuation contains '\' followed by ']'; unescaped inside [...] that
# pair reads as an escaped ']' and the backslash itself is never matched,
# so the characters are escaped before building the class.
_PUNCTUATION_RE = re.compile("[" + re.escape(string.punctuation) + "]")

def removePunctuation(x):
    """Lowercase ``x`` and blank out non-ASCII and punctuation characters.

    Parameters
    ----------
    x : str
        Text to normalise.

    Returns
    -------
    str
        ``x`` lowercased, with every non-ASCII character and every
        character of ``string.punctuation`` replaced by a single space
        (the length of the string is preserved).
    """
    # Lowercasing all words
    x = x.lower()
    # Removing non ASCII chars
    x = _NON_ASCII_RE.sub(' ', x)
    # Removing (replacing with empty spaces actually) all the punctuations
    return _PUNCTUATION_RE.sub(" ", x)

In [9]:
#Lowercasing and stripping punctuation in the title and content columns of every input file
for frame in dataframes.values():
    for column in ("title", "content"):
        frame[column] = frame[column].map(removePunctuation)

In [11]:
#Preview of the cooking file after removing punctuation (first 5 rows)
dataframes["cooking"].head(n=5)

Unnamed: 0,id,title,content,tags
0,1,how can i get chewy chocolate chip cookies,my chocolate chips cookies are always too cris...,baking cookies texture
1,2,how should i cook bacon in an oven,i ve heard of people cooking bacon in an oven ...,oven cooking-time bacon
2,3,what is the difference between white and brown...,i always use brown extra large eggs but i can...,eggs
3,4,what is the difference between baking soda and...,and can i use one in place of the other in cer...,substitutions please-remove-this-tag baking-so...
4,5,in a tomato sauce recipe how can i cut the ac...,it seems that every time i make a tomato sauce...,sauce pasta tomatoes italian-cuisine


In [12]:
#Removing stopword function
# English stopword list from nltk, held as a set for fast membership tests
stops = set(stopwords.words("english"))

def removeStopwords(x):
    """Return ``x`` with every whitespace-separated word found in ``stops`` dropped.

    The surviving words are re-joined with single spaces, so runs of
    whitespace in the input collapse.
    """
    kept = filter(lambda token: token not in stops, x.split())
    return " ".join(kept)

In [13]:
#Dropping stopwords from the title and content columns of every input file
for frame in dataframes.values():
    for column in ("title", "content"):
        frame[column] = frame[column].map(removeStopwords)

In [14]:
#Preview of the cooking file after removing stopwords (first 5 rows)
dataframes["cooking"].head(n=5)

Unnamed: 0,id,title,content,tags
0,1,get chewy chocolate chip cookies,chocolate chips cookies always crisp get chewy...,baking cookies texture
1,2,cook bacon oven,heard people cooking bacon oven laying strips ...,oven cooking-time bacon
2,3,difference white brown eggs,always use brown extra large eggs honestly say...,eggs
3,4,difference baking soda baking powder,use one place certain recipes,substitutions please-remove-this-tag baking-so...
4,5,tomato sauce recipe cut acidity,seems every time make tomato sauce pasta sauce...,sauce pasta tomatoes italian-cuisine


In [15]:
#Splitting the whitespace-separated tags string of each row into a list of tags
for frame in dataframes.values():
    frame["tags"] = frame["tags"].map(str.split)

In [16]:
#Preview of the cooking file after splitting the tags (first 5 rows)
dataframes["cooking"].head(n=5)

Unnamed: 0,id,title,content,tags
0,1,get chewy chocolate chip cookies,chocolate chips cookies always crisp get chewy...,"[baking, cookies, texture]"
1,2,cook bacon oven,heard people cooking bacon oven laying strips ...,"[oven, cooking-time, bacon]"
2,3,difference white brown eggs,always use brown extra large eggs honestly say...,[eggs]
3,4,difference baking soda baking powder,use one place certain recipes,"[substitutions, please-remove-this-tag, baking..."
4,5,tomato sauce recipe cut acidity,seems every time make tomato sauce pasta sauce...,"[sauce, pasta, tomatoes, italian-cuisine]"


In [17]:
#Saving as new files
for name in dataframes:
    # One "<name>_new.csv" per dataset in the working directory, index column omitted
    dataframes[name].to_csv("{}_new.csv".format(name), index=False)

In [18]:
#Reading new files
# Reload every "<name>_new.csv" written to the working directory, keyed by dataset name
dataframes1 = {
    name: pd.read_csv(name + "_new.csv")
    for name in ("cooking", "crypto", "robotics", "biology", "travel", "diy")
}

In [19]:
#Reading test file
test_csv_path = "../input/test.csv"
physic = pd.read_csv(test_csv_path)

In [None]:
#Showing first 5 rows of test file
# `physic` is the frame read from ../input/test.csv; the cell previously
# displayed the reloaded cooking data instead of the test file.
physic.head(5)

In [20]:
#Cleaning the test file
punctuations = string.punctuation

def data_clean(data):
    """Reduce each text of a Series to its list of lemmatized nouns.

    Every element goes through, in order: lowercasing, HTML removal with
    BeautifulSoup, replacement of a leading/trailing run of non-word
    characters by one space, stripping of punctuation characters from both
    ends, tokenization, selection of tokens whose POS tag starts with
    ``'NN'``, WordNet lemmatization, and removal of words of length <= 2.

    Parameters
    ----------
    data : pandas.Series of str
        Raw texts. Any index is accepted; elements are processed by value,
        not looked up by label.

    Returns
    -------
    pandas.Series of list of str
        One list of cleaned words per input element, on the same index.
    """
    print('Cleaning data')
    # Built once and shared by every element
    wordnet_lemmatizer = WordNetLemmatizer()

    def _clean_one(text):
        # Full cleaning pipeline for a single document
        text = BeautifulSoup(text.lower(), 'html.parser').get_text()
        text = re.sub(r'^\W+|\W+$', ' ', text)
        text = text.strip(punctuations)
        tagged = nltk.pos_tag(word_tokenize(text))
        # Select only the nouns (tags NN, NNS, NNP, NNPS all start with 'NN')
        nouns = [word for (word, pos) in tagged if pos[:2] == 'NN']
        lemmas = [wordnet_lemmatizer.lemmatize(word) for word in nouns]
        return [lemma for lemma in lemmas if len(lemma) > 2]

    return data.apply(_clean_one)

In [21]:
#Get word frequency function
def get_frequency(title):
    """Count words per document and documents per word.

    Parameters
    ----------
    title : iterable of iterable of str
        One sequence of words per document (e.g. a list of lists, or a
        pandas Series of lists with any index).

    Returns
    -------
    tuple (frequency, inverse_frequency)
        frequency : list of dict
            For each document, in input order, a mapping word -> number of
            occurrences in that document (empty dict for an empty document).
        inverse_frequency : dict
            Mapping word -> number of documents that contain the word at
            least once.
    """
    frequency = []
    inverse_frequency = {}
    # Iterate over the values themselves: indexing with title[i] is a label
    # lookup on a Series and fails when the index is not 0..n-1.
    for words in title:
        word_count = {}
        for word in words:
            word_count[word] = word_count.get(word, 0) + 1

        # Each distinct word of this document adds one to its document count
        for word in word_count:
            inverse_frequency[word] = inverse_frequency.get(word, 0) + 1
        frequency.append(word_count)

    return (frequency, inverse_frequency)

In [22]:
#Cleaning title column of test file
title = data_clean(physic["title"])

Cleaning data


In [23]:
#Getting the per-title word counts and the per-word document counts
word_stats = get_frequency(title)
frequency, inverse_frequency = word_stats


In [26]:
#Frequency of words in sorted order
# Sum each word's per-document counts over the whole corpus
frequency_words = {}
for document in frequency:
    for word, count in document.items():
        frequency_words[word] = frequency_words.get(word, 0) + count
# Keep the (word, total count) pairs, ordered by ascending count; sorting the
# bare values would throw away which word each count belongs to.
frequency_words = sorted(frequency_words.items(), key=operator.itemgetter(1))

In [27]:
#Print how many distinct words were counted
vocabulary_size = len(frequency_words)
print('number of words:', vocabulary_size)

number of words: 14265


In [28]:
# TF-IDF weight of every word in every document:
#   (count in document / highest count in document) * log(number of documents / documents containing the word)
# The weights go into NEW dicts so the raw counts in `frequency` stay intact
# and re-running this cell gives the same result.
n_documents = len(frequency)
tfidf = []
tfidf_distribution = []
for document in frequency:
    weights = {}
    # An empty document keeps an empty dict so tfidf[i] still lines up with document i
    if document:
        max_frequency = float(max(document.values()))
        for word, count in document.items():
            weights[word] = count / max_frequency * np.log(n_documents / float(inverse_frequency[word]))
            # Flat list of every weight, in document order
            tfidf_distribution.append(weights[word])
    tfidf.append(weights)

In [29]:
# Words of one sample question, highest weight first
index = 1
sorted(tfidf[index].items(), key=lambda pair: pair[1], reverse=True)

[('explanation', 5.4584997575763294),
 ('string', 5.2709388460963753),
 ('theory', 3.7091753309824185)]

In [30]:
# Raw title and body of the same sample question
for column in ("title", "content"):
    print(physic[column][index])

What is your simplest explanation of the string theory?
<p>How would you explain string theory to non physicists such as myself? I'm specially interested on how plausible is it and what is needed to successfully prove it?</p>



In [31]:
#Generate up to 8 tags for each question
top = 8
# One [id, "tag tag ..."] row per question: its words ranked by descending weight, cut to `top`
output = [
    [physic.id[row], ' '.join(sorted(tfidf[row], key=tfidf[row].get, reverse=True)[:top])]
    for row in range(len(physic))
]

In [32]:
# Write the predictions as an id,tags CSV without the index column
submission = pd.DataFrame(data=output, columns=['id', 'tags'])
submission.to_csv('submission.csv', index=False)

In [33]:
# Read the written submission back for inspection
dataframes3 = dict(sub=pd.read_csv("submission.csv"))

In [34]:
# First 5 rows of the submission file
dataframes3["sub"].head(n=5)

Unnamed: 0,id,tags
0,1,particle
1,2,explanation string theory
2,3,lie representation physic theory particle
3,7,
4,9,hamilton principle


In [35]:
# Final cell: prints a marker once execution reaches the end of the notebook
print("Completed")

Completed
