### <span style="color:purple">  Importing Necessary Libraries and Files

In [1]:
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tag import *
import time
import itertools
from nltk import FreqDist
import pandas as pd 
import gzip 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display_html
from bs4 import BeautifulSoup 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy import spatial
%matplotlib inline
D1 = pd.read_csv('D1.csv')
D2 = pd.read_csv('D2.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Satya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Satya\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### <span style="color:purple">  Function for Pre Processing 

In [2]:
def preprocessing(File):
    start_time = time.time()
    corpus=[]
    for i in range(0,len(File)): 
        review=BeautifulSoup(File[i]).get_text()  #This Removes HTML Tags
        review= re.sub('[^a-zA-Z]',' ',review)    # Removes Punctuations and other symbols
        review=review.lower().split()             # Converts Everything to one single case
        review=[word for word in review if not word in set(stopwords.words('english'))]  #Removes Stop words
        review=' '.join(review)
        corpus.append(review)
    end_time = time.time()
    return(start_time,end_time,corpus)

### <span style="color:purple">  Function for Vectorization and TF-IDF Score

In [3]:
def tf_idf_score ( File_Name , no_of_features ):
    countvec = CountVectorizer(max_features = no_of_features)
    File_Name=pd.DataFrame(File_Name['Body'])
    File_Name = pd.DataFrame( countvec.fit_transform(File_Name.Body).toarray(), columns=countvec.get_feature_names())
    tfidf_transformer = TfidfTransformer().fit(File_Name)
    File_Name_tf = tfidf_transformer.transform(File_Name)
    File_Name_tf_score = pd.DataFrame(File_Name_tf.toarray())
    column_list = File_Name.columns.values
    df_empty = pd.DataFrame( columns=column_list)
    for i in range(1):
        df_empty.loc[i] = 0
    return tfidf_transformer , File_Name_tf_score , column_list , df_empty

### <span style="color:purple">  Function for finding Top 5 relevant Posts by using Cosine Similarity

In [31]:
def rank_docs (Raw_File_Name,tf_score,column_list,input_post,tfidf_transformer,df_empty):
    input_mod = input_post.join(df_empty, lsuffix='',rsuffix='_other')
    input_mod1= input_mod[column_list]
    tfidf = tfidf_transformer.transform(input_mod1)
    tf_input = pd.DataFrame(tfidf.toarray())
    similarity_score=[]
    for i in range(len(tf_score.index)):
        cosine_similarity = 1-spatial.distance.cosine(tf_input, tf_score.iloc[[i]])
        similarity_score.append(cosine_similarity)
        similarity_df = pd.DataFrame(similarity_score)
        similarity_df.columns = ["similarity_score"]
    top_5 = similarity_df.nlargest(5,'similarity_score')
    bottom_5 = similarity_df.nsmallest(5,'similarity_score')
    input_D1 = pd.read_csv(Raw_File_Name)
    output_top5 = top_5.join(input_D1)[['Title','Body','similarity_score']]
    print('\033[1m{:10s}\033[0m'.format('                   The Top 5  Posts having good match are'))
    display(output_top5)

### <span style="color:purple">  PreProcessing for D1.CSV and D2.CSV

In [5]:
D1 = pd.read_csv('D1.csv')
start_time ,end_time,D1['Body'] = preprocessing(D1['Body'])
print("The Execution Time of the pre processing is %s seconds" % (end_time - start_time))
D2 = pd.read_csv('D2.csv')
D2 = pd.DataFrame(D2.head(10000))
start_time ,end_time,D2['Body'] = preprocessing(D2['Body'])
print("The Execution Time of the pre processing is %s seconds" % (end_time - start_time))
countvec = CountVectorizer()

The Execution Time of the pre processing is 273.6839528083801 seconds
The Execution Time of the pre processing is 558.6939568519592 seconds


### <span style="color:purple">  Vectorization and calculation of TF-IDF Score for D1.CSV and D2.CSV

In [6]:
D1_tfidf_transformer , D1_tf_score , D1_column_list , D1_df_empty = tf_idf_score(D1,600)

In [7]:
D2_tfidf_transformer , D2_tf_score , D2_column_list , D2_df_empty = tf_idf_score(D2,3000)

# <span style="color: BLUE">   Application Interface  for  D1.CSV

### <span style="color:purple">  Reading Input from Text File and Input PreProcessing for "D1.CSV"

In [43]:
input_post = pd.read_csv('InputD1.txt',  header=None) # The Input post must be enetered in this file
input_post.columns = ["Body"]
start_time ,end_time,input_post['Body'] = preprocessing(input_post['Body'])
print("The Execution Time of the pre processing is %s seconds" % (end_time - start_time))
input_post=pd.DataFrame(input_post['Body'])
input_postD1 = pd.DataFrame( countvec.fit_transform(input_post.Body).toarray(), columns=countvec.get_feature_names())

The Execution Time of the pre processing is 0.014993906021118164 seconds


### <span style="color:purple">  Displaying TOP 5 Posts for D1.CSV  by using Cosine Similarity

In [44]:
pd.options.display.max_colwidth = 1000
rank_docs ('D1.csv',D1_tf_score,D1_column_list,input_postD1,D1_tfidf_transformer,D1_df_empty)

[1m                   The Top 5  Posts having good match are[0m


Unnamed: 0,Title,Body,similarity_score
1,How does noise affect generalization?,<p>Does increasing the noise in data help to improve the learning ability of a network? Does it make any difference or does it depend on the problem being solved? How is it affect the generalization process overall?</p>\n,1.0
157,What is the difference between AI and robots?,<p>What is the difference between AI and robots?</p>\n,0.352893
3037,Loss/accuracy on Synthetic data,<p>I am trying to understand if there is any difference in the the interpretation of accuracy and loss on synthetic data vs real data.</p>\n,0.292964
17,"What is the ""dropout"" technique?","<p>What purpose does the ""dropout"" method serve and how does it improve the overall performance of the neural network?</p>\n",0.279287
2989,What are temporal-difference and Monte Carlo methods intuitively?,"<p>Intuitively, how do temporal-difference and Monte Carlo methods work in reinforcement learning? How can they be used to solve the reinforcement learning problem?</p>\n",0.276651


# <span style="color: BLUE">   Application Interface  for  D2.CSV

### <span style="color:purple">  Reading Input from Text File and Input PreProcessing for D2.CSV

In [47]:
input_post = pd.read_csv('InputD2.txt',  header=None)  # The Input post must be enetered in this file
input_post.columns = ["Body"]
start_time ,end_time,input_post['Body'] = preprocessing(input_post['Body'])
print("The Execution Time of the pre processing is %s seconds" % (end_time - start_time))
input_post=pd.DataFrame(input_post['Body'])
input_postD2 = pd.DataFrame( countvec.fit_transform(input_post.Body).toarray(), columns=countvec.get_feature_names())

The Execution Time of the pre processing is 0.014991283416748047 seconds


### <span style="color:purple">  Displaying TOP 5 Posts for D2.CSV by using Cosine Similarity

In [48]:
pd.options.display.max_colwidth = 1000
rank_docs ('D2.csv',D2_tf_score,D2_column_list,input_postD2,D2_tfidf_transformer,D2_df_empty)

[1m                   The Top 5  Posts having good match are[0m


Unnamed: 0,Title,Body,similarity_score
4,Do I really need to install a task manager?,<p>Does one really need to install a task manager? My phone does slowdown on rare occasions but it's not something that hampers regular usage.</p>\n,1.0
130,Is it advisable to run a task killer app on Android,"<blockquote>\n <p><strong>Possible Duplicate:</strong><br>\n <a href=""https://android.stackexchange.com/questions/9/do-i-really-need-to-install-a-task-manager"">Do I really need to install a task manager?</a> </p>\n</blockquote>\n\n\n\n<p>Should I run a task killer on Android or does Android manage applications well enough on its own?</p>\n",0.472942
5009,apps(tasks) restarting even after killing them,"<blockquote>\n <p><strong>Possible Duplicate:</strong><br>\n <a href=""https://android.stackexchange.com/questions/9/do-i-really-need-to-install-a-task-manager"">Do I really need to install a task manager?</a><br>\n <a href=""https://android.stackexchange.com/questions/2258/apps-automatically-starting"">apps automatically starting?</a><br>\n <a href=""https://android.stackexchange.com/q/30332/1465"">How can I stop applications and services from running?</a> </p>\n</blockquote>\n\n\n\n<p>I have some apps running in background using cpu all the time and draining my batery. I want to stop them. </p>\n\n<p>I have a latest <code>task manager</code> to kill the background tasks but BUT some of the tasks(apps) are restarting even after killing them, WHY ?</p>\n\n<p>Is there any way to permanently kill a particular task ?</p>\n",0.341859
244,What is the right way to close apps in Android?,"<p>I'm new to the Android platform, and this is probably a dumb question, but...</p>\n\n<p>How do you close an app?</p>\n\n<p>When I'm in an app, and I want to get out, I just click on ""Home"" and move on, but I just installed a Task Manager app, and I noticed <em>everything</em> is left running.</p>\n\n<p>Is there a way to exit apps? Or do I need to use the Task Manager every time I exit something?</p>\n",0.341206
77,How do I identify apps or settings that cause performance problems?,"<p>If I am having performance problems, what tools can I use to identify where my performance lags are coming from? I have used a <a href=""http://www.appbrain.com/app/nextapp.systempanel.r1"" rel=""noreferrer"">System Panel Task Manager</a>, <a href=""http://www.appbrain.com/app/com.electricsheep.asi"" rel=""noreferrer"">Android System Info</a>, <a href=""http://www.appbrain.com/app/com.droidappfactory.systemmanager"" rel=""noreferrer"">System Manager Free</a> and some others but none of them have really identified the problem children. System Panel Task Manager came the closest with it's historical look at CPU and memory usage but it didn't let me look at <em>which apps</em> were using how much resources over time.</p>\n",0.328852
