In [116]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import re

In [117]:
df_questions = pd.read_csv('Questions.csv', encoding='iso-8859-1')
df_tags = pd.read_csv('Tags.csv', encoding='iso-8859-1')
df_questions.head()

Unnamed: 0,Id,OwnerUserId,CreationDate,Score,Title,Body
0,6,5.0,2010-07-19T19:14:44Z,272,The Two Cultures: statistics vs. machine learning?,"<p>Last year, I read a blog post from <a href=""http://anyall.org/"">Brendan O'Connor</a> entitled <a href=""http://anyall.org/blog/2008/12/statistics-vs-machine-learning-fight/"">""Statistics vs. Machine Learning, fight!""</a> that discussed some of the differences between the two fields. <a href=""http://andrewgelman.com/2008/12/machine_learnin/"">Andrew Gelman responded favorably to this</a>:</p>\..."
1,21,59.0,2010-07-19T19:24:36Z,4,Forecasting demographic census,"<p>What are some of the ways to forecast demographic census with some validation and calibration techniques?</p>\n\n<p>Some of the concerns:</p>\n\n<ul>\n<li>Census blocks vary in sizes as rural\nareas are a lot larger than condensed\nurban areas. Is there a need to account for the area size difference?</li>\n<li>if let's say I have census data\ndating back to 4 - 5 census periods,\nhow far ca..."
2,22,66.0,2010-07-19T19:25:39Z,208,Bayesian and frequentist reasoning in plain English,<p>How would you describe in plain English the characteristics that distinguish Bayesian from Frequentist reasoning?</p>\n
3,31,13.0,2010-07-19T19:28:44Z,138,What is the meaning of p values and t values in statistical tests?,"<p>After taking a statistics course and then trying to help fellow students, I noticed one subject that inspires much head-desk banging is interpreting the results of statistical hypothesis tests. It seems that students easily learn how to perform the calculations required by a given test but get hung up on interpreting the results. Many computerized tools report test results in terms of ""p ..."
4,36,8.0,2010-07-19T19:31:47Z,58,Examples for teaching: Correlation does not mean causation,"<p>There is an old saying: ""Correlation does not mean causation"". When I teach, I tend to use the following standard examples to illustrate this point:</p>\n\n<ol>\n<li>number of storks and birth rate in Denmark;</li>\n<li>number of priests in America and alcoholism;</li>\n<li>in the start of the 20th century it was noted that there was a strong correlation between 'Number of radios' and 'Numb..."


In [118]:
df_questions.shape

(85085, 6)

In [119]:
grouped_tags = df_tags.groupby("Tag", sort='count').size().reset_index(name='count')
grouped_tags.Tag.describe()

count     1315
unique    1315
top       lars
freq         1
Name: Tag, dtype: object

In [120]:
num_classes = 100
grouped_tags = df_tags.groupby("Tag").size().reset_index(name='count')
most_common_tags = grouped_tags.nlargest(num_classes, columns="count")
df_tags.Tag = df_tags.Tag.apply(lambda tag : tag if tag in most_common_tags.Tag.values else None)
df_tags = df_tags.dropna()

In [121]:
import re 

def strip_html_tags(body):
    regex = re.compile('<.*?>')
    return re.sub(regex, '', body)

df_questions['Body'] = df_questions['Body'].apply(strip_html_tags)
df_questions['Text'] = df_questions['Title'] + ' ' + df_questions['Body']

In [122]:
# denormalize tables

def tags_for_question(question_id):
    return df_tags[df_tags['Id'] == question_id].Tag.values

def add_tags_column(row):
    row['Tags'] = tags_for_question(row['Id'])
    return row

df_questions = df_questions.apply(add_tags_column, axis=1)

In [123]:
pd.set_option('display.max_colwidth', 400)
df_main=df_questions[[ 'Text', 'Tags']]

In [124]:
df_main.shape

(85085, 2)

In [125]:
tags_counts=df_main["Tags"].apply(lambda text: len(text))
print(tags_counts.value_counts())

1    27729
2    27704
3    14931
0     8720
4     5022
5      979
Name: Tags, dtype: int64


In [126]:
import nltk
flat_list = [item for sublist in df_questions['Tags'].values for item in sublist]

keywords = nltk.FreqDist(flat_list)

In [127]:
df_fdist=pd.DataFrame(keywords.items(), columns=['word', 'frequency'])
#sorting tags based upon frequency
tag_df_sorted = df_fdist.sort_values(['frequency'], ascending = False)
tag_counts = tag_df_sorted['frequency'].values
tag_df_sorted

Unnamed: 0,word,frequency
15,r,13236
17,regression,10959
0,machine-learning,6089
10,time-series,5559
40,probability,4217
...,...,...
54,nonlinear-regression,514
72,cox-model,510
81,monte-carlo,504
36,proportion,503


In [128]:
import plotly.express as px
fig = px.bar(tag_df_sorted.head(20), x='word', y='frequency',
             labels={'Top 20 Tags':'Top 20 Tags'}, height=400)
fig.show()

In [129]:
frq=[]
for x in range(1,101):
    if x%10==0:
        value_explained=(tag_df_sorted["frequency"].head(x).sum()/tag_df_sorted["frequency"].sum())*100
        print("number of tags {}-->{}".format(x, value_explained))
        frq.append(value_explained)
    else:
        frq.append((tag_df_sorted["frequency"].head(x).sum()/tag_df_sorted["frequency"].sum())*100)

number of tags 10-->37.51087219530059
number of tags 20-->52.33498786891892
number of tags 30-->62.37272174373663
number of tags 40-->70.3445750197825
number of tags 50-->76.81622883600478
number of tags 60-->82.73397291270199
number of tags 70-->88.17432134612491
number of tags 80-->92.53235499924793
number of tags 90-->96.52416733698247
number of tags 100-->100.0


In [130]:
tag_df_sorted["frequency"].head(75).sum()

138258

In [131]:
# tag_df_sorted["frequency"].sum()

In [132]:
# 138258/152913

In [133]:
# tag_df_sorted["frequency"].head(80).sum()

In [134]:
# tag_df_sorted["frequency"].sum()

In [135]:
# 141494/152913

In [136]:
import plotly.express as px
fig = px.line(frq)
fig.show()

In [137]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [138]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
#Stemmering the word
sno = nltk.stem.SnowballStemmer('english') #initialising the snowball stemmer

In [139]:
df_main["Text"][0]

'The Two Cultures: statistics vs. machine learning? Last year, I read a blog post from Brendan O\'Connor entitled "Statistics vs. Machine Learning, fight!" that discussed some of the differences between the two fields.  Andrew Gelman responded favorably to this:\n\nSimon Blomberg: \n\n\n  From R\'s fortunes\n  package: To paraphrase provocatively,\n  \'machine learning is statistics minus\n  any checking of models and\n  assumptions\'.\n  -- Brian D. Ripley (about the difference between machine learning\n  and statistics) useR! 2004, Vienna\n  (May 2004) :-) Season\'s Greetings!\n\n\nAndrew Gelman:\n\n\n  In that case, maybe we should get rid\n  of checking of models and assumptions\n  more often. Then maybe we\'d be able to\n  solve some of the problems that the\n  machine learning people can solve but\n  we can\'t!\n\n\nThere was also the "Statistical Modeling: The Two Cultures" paper by Leo Breiman in 2001 which argued that statisticians rely too heavily on data modeling, and that m

In [140]:
def clean_title(text):
    text=text.lower()
    cleanr = re.compile('<.*?>')
    text = re.sub(cleanr, ' ', str(text))
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"that's", "that is", text)        
    text = re.sub(r"what's", "what is", text)
    text = re.sub(r"where's", "where is", text) 
    text = re.sub(r"\'ll", " will", text)  
    text = re.sub(r"\'ve", " have", text)  
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"did't", "did not", text)
    text = re.sub(r"can't", "can not", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"couldn't", "could not", text)
    text = re.sub(r"have't", "have not", text)
    text=re.sub(r'[^#+ a-zA-Z]',' ', text)
    text=re.sub('\s+',' ',text)
    text=re.sub(r'[^0-9#+ a-zA-Z\s]',' ', text)
    return text

def text_normalization(text):
    #stopword removal and stemming
    string=""
    for word in text.split():
        if not word in stop_words:
            word=(sno.stem(word))
            string += word + " "
    return string

title=[]
for x in df_main["Text"]:
    #print(x)
    x=clean_title(x)
    clr_title=text_normalization(x)
    title.append(clr_title)
    #print(clr_title)
    #print("==========")
df_main["Text"]=title



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [143]:
df_main["Text"][0]

'two cultur statist vs machin learn last year read blog post brendan connor entitl statist vs machin learn fight discuss differ two field andrew gelman respond favor simon blomberg r fortun packag paraphras provoc machin learn statist minus check model assumpt brian ripley differ machin learn statist user vienna may season greet andrew gelman case mayb get rid check model assumpt often mayb would abl solv problem machin learn peopl solv also statist model two cultur paper leo breiman argu statistician reli heavili data model machin learn techniqu make progress instead reli predict accuraci model statist field chang last decad respons critiqu two cultur still exist statist grown embrac machin learn techniqu neural network support vector machin '

In [144]:
df_main["Tags"][6]

array(['time-series'], dtype=object)

In [145]:
Top_75_tag=tag_df_sorted["word"].head(75).to_list()
len(Top_75_tag)

75

In [146]:
import numpy as np
selected_tag_row=[]
count=0
for train_tags in df_main["Tags"]:
    founds_tags=[x for x in train_tags if x in Top_75_tag]
    if len(founds_tags)<1:
        founds_tags=np.nan
        #print(count)
    count=count+1
    selected_tag_row.append(founds_tags)

In [147]:
selected_tag_row

[['machine-learning'],
 ['forecasting'],
 ['bayesian'],
 ['hypothesis-testing', 't-test', 'p-value', 'interpretation'],
 ['correlation'],
 ['nonparametric', 'survival'],
 ['time-series'],
 ['data-visualization', 'references'],
 ['machine-learning'],
 ['references'],
 ['classification'],
 ['bayesian', 'references'],
 nan,
 ['time-series'],
 ['sample-size'],
 ['r', 'time-series', 'poisson'],
 ['regression'],
 ['t-test'],
 ['references'],
 ['data-visualization'],
 ['standard-deviation', 'variance', 'anova'],
 ['modeling'],
 ['clustering'],
 ['estimation'],
 ['regression', 'distributions', 'data-transformation'],
 ['r', 'clustering', 'feature-selection'],
 ['self-study', 'econometrics', 'autocorrelation'],
 ['machine-learning'],
 ['references'],
 ['classification', 'confidence-interval', 'nonparametric', 'bootstrap'],
 ['hypothesis-testing', 'confidence-interval'],
 ['confidence-interval', 'spss'],
 ['chi-squared'],
 ['clustering'],
 ['time-series', 'neural-networks'],
 ['t-test'],
 ['vari

In [148]:
df_main["Tags"]=selected_tag_row



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [149]:
df_main.head()

Unnamed: 0,Text,Tags
0,two cultur statist vs machin learn last year read blog post brendan connor entitl statist vs machin learn fight discuss differ two field andrew gelman respond favor simon blomberg r fortun packag paraphras provoc machin learn statist minus check model assumpt brian ripley differ machin learn statist user vienna may season greet andrew gelman case mayb get rid check model assumpt often mayb wou...,[machine-learning]
1,forecast demograph census way forecast demograph census valid calibr techniqu concern census block vari size rural area lot larger condens urban area need account area size differ let say census data date back census period far forecast futur census zone chang light boundari account chang method valid census forecast exampl data exist census period model first test latter two anoth way state p...,[forecasting]
2,bayesian frequentist reason plain english would describ plain english characterist distinguish bayesian frequentist reason,[bayesian]
3,mean p valu valu statist test take statist cours tri help fellow student notic one subject inspir much head desk bang interpret result statist hypothesi test seem student easili learn perform calcul requir given test get hung interpret result mani computer tool report test result term p valu valu would explain follow point colleg student take first cours statist p valu mean relat hypothesi tes...,"[hypothesis-testing, t-test, p-value, interpretation]"
4,exampl teach correl mean causat old say correl mean causat teach tend use follow standard exampl illustr point number stork birth rate denmark number priest america alcohol start th centuri note strong correl number radio number peopl insan asylum favorit pirat caus global warm howev refer exampl whilst amus obvious fals anyon good exampl,[correlation]


In [150]:
df_main.to_csv("train_preprocessed.csv")

In [161]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np
import re

In [162]:
df_questions = pd.read_csv('train_preprocessed.csv')
df_questions.head()

Unnamed: 0.1,Unnamed: 0,Text,Tags
0,0,two cultur statist vs machin learn last year read blog post brendan connor entitl statist vs machin learn fight discuss differ two field andrew gelman respond favor simon blomberg r fortun packag paraphras provoc machin learn statist minus check model assumpt brian ripley differ machin learn statist user vienna may season greet andrew gelman case mayb get rid check model assumpt often mayb wou...,['machine-learning']
1,1,forecast demograph census way forecast demograph census valid calibr techniqu concern census block vari size rural area lot larger condens urban area need account area size differ let say census data date back census period far forecast futur census zone chang light boundari account chang method valid census forecast exampl data exist census period model first test latter two anoth way state p...,['forecasting']
2,2,bayesian frequentist reason plain english would describ plain english characterist distinguish bayesian frequentist reason,['bayesian']
3,3,mean p valu valu statist test take statist cours tri help fellow student notic one subject inspir much head desk bang interpret result statist hypothesi test seem student easili learn perform calcul requir given test get hung interpret result mani computer tool report test result term p valu valu would explain follow point colleg student take first cours statist p valu mean relat hypothesi tes...,"['hypothesis-testing', 't-test', 'p-value', 'interpretation']"
4,4,exampl teach correl mean causat old say correl mean causat teach tend use follow standard exampl illustr point number stork birth rate denmark number priest america alcohol start th centuri note strong correl number radio number peopl insan asylum favorit pirat caus global warm howev refer exampl whilst amus obvious fals anyon good exampl,['correlation']


In [163]:
df_questions.head()

Unnamed: 0.1,Unnamed: 0,Text,Tags
0,0,two cultur statist vs machin learn last year read blog post brendan connor entitl statist vs machin learn fight discuss differ two field andrew gelman respond favor simon blomberg r fortun packag paraphras provoc machin learn statist minus check model assumpt brian ripley differ machin learn statist user vienna may season greet andrew gelman case mayb get rid check model assumpt often mayb wou...,['machine-learning']
1,1,forecast demograph census way forecast demograph census valid calibr techniqu concern census block vari size rural area lot larger condens urban area need account area size differ let say census data date back census period far forecast futur census zone chang light boundari account chang method valid census forecast exampl data exist census period model first test latter two anoth way state p...,['forecasting']
2,2,bayesian frequentist reason plain english would describ plain english characterist distinguish bayesian frequentist reason,['bayesian']
3,3,mean p valu valu statist test take statist cours tri help fellow student notic one subject inspir much head desk bang interpret result statist hypothesi test seem student easili learn perform calcul requir given test get hung interpret result mani computer tool report test result term p valu valu would explain follow point colleg student take first cours statist p valu mean relat hypothesi tes...,"['hypothesis-testing', 't-test', 'p-value', 'interpretation']"
4,4,exampl teach correl mean causat old say correl mean causat teach tend use follow standard exampl illustr point number stork birth rate denmark number priest america alcohol start th centuri note strong correl number radio number peopl insan asylum favorit pirat caus global warm howev refer exampl whilst amus obvious fals anyon good exampl,['correlation']


In [164]:
df_questions.drop(['Unnamed: 0'], axis = 1,inplace=True)

In [160]:
df_questions.to_csv("train_preprocessed.csv")