# Initialisation du notebook

L'objectif du module est de développer un système de suggestion de tags pour le site Stack Overflow. Celui-ci prendra la forme d’un algorithme de machine learning qui assigne automatiquement plusieurs tags pertinents à une question.

Ce notebook se consacre à l'exploration des données brutes. 

In [1]:
%%javascript
// Set the notebook output area's auto-scroll threshold to 10.
// NOTE(review): presumably the number of output lines above which the classic
// Notebook UI switches an output to a scrollable box; the `IPython` global may
// not exist in other front-ends (e.g. JupyterLab) -- confirm.
IPython.OutputArea.auto_scroll_threshold=10

<IPython.core.display.Javascript object>

In [2]:
# -*- coding: utf-8 -*-
%matplotlib inline
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
import glob, os, pickle, warnings
import re
from sklearn.preprocessing import MultiLabelBinarizer
import collections

# Configuration

# Folder holding the "QueryResults*.csv" exports. It can be overridden with the
# SO_DATA_PATH environment variable so the notebook is not tied to one machine;
# the historical Windows location remains the default.
data_path = os.environ.get("SO_DATA_PATH", r"C:\OCR\06 _ Catégorisez automatiquement des questions")

pd.options.mode.chained_assignment = None # default='warn'
pd.set_option('display.max_columns', None)
# `None` means "no limit". The former value -1 is deprecated since pandas 1.0
# and rejected by recent releases; very old releases only accept the integer,
# hence the fallback.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:,.2f}'.format
# NOTE(review): this silences every warning raised through the `warnings`
# module for the rest of the notebook.
warnings.filterwarnings("ignore")

# Chargement et découverte des données brutes

In [3]:
# sorted(): glob gives no ordering guarantee, and the row order of the
# concatenated frame should not depend on the file system.
all_files = sorted(glob.glob(os.path.join(data_path, "QueryResults*.csv")))

if not all_files:
    # Fail with an actionable message instead of pd.concat's
    # "No objects to concatenate".
    raise FileNotFoundError(
        "No 'QueryResults*.csv' file found in {0!r}".format(data_path)
    )


def _read_export(csv_path):
    """Read one CSV export, dropping malformed lines instead of failing.

    `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0 in favour
    of `on_bad_lines`; older releases reject the new keyword with a TypeError,
    in which case the legacy keyword is used.
    """
    read_options = dict(sep=',', encoding='utf-8', engine='python')
    try:
        return pd.read_csv(csv_path, on_bad_lines='warn', **read_options)
    except TypeError:
        return pd.read_csv(csv_path, error_bad_lines=False, **read_options)


# One DataFrame per export file, concatenated below under a fresh 0..n-1 index.
li = [_read_export(absolute_filename) for absolute_filename in all_files]

df_brut = pd.concat(li, axis=0, ignore_index=True)

print('Taille du CSV importé : {0}'.format(df_brut.shape))

print(pd.DataFrame(df_brut.dtypes, columns=['Type']))

Taille du CSV importé : (91947, 6)
         Type
Id     int64 
Name   object
Score  int64 
Body   object
Title  object
Tags   object


In [4]:
# Cache the concatenated raw frame in the working directory so that later
# sessions can reload it without parsing the CSV exports again.
df_brut.to_pickle("./df_brut.pkl")

Les données extraites ciblent uniquement les posts de type "Question" dont les Id sont compris entre 0 et 600000.

In [None]:
# Reload the raw frame from the pickle written by the previous cell.
# NOTE(review): unpickling executes arbitrary code; only load a file produced
# by this notebook.
df_brut = pd.read_pickle('./df_brut.pkl')

# Pandas Profiling

In [None]:
# Build a profiling report of the raw frame and write it to "output.html".
# NOTE(review): `profile_report` is presumably the DataFrame method registered
# by the `pandas_profiling` import; the accepted keyword arguments (`style`)
# depend on the installed version -- confirm.
profile = df_brut.profile_report(title='Pandas Profiling Report', style={'full_width':True})
profile.to_file(output_file="output.html")

Aucune valeur manquante dans BODY, TITLE et TAGS

In [None]:
import matplotlib.pyplot as plt

# Boxplot of the variable Score
plt.figure(figsize=(5, 5))
plt.title("Boxplot of Score", fontsize=20)

# Boxplot without showing the outliers (showfliers=False)
plt.xticks(size=15)
plt.yticks(size=15)
plt.boxplot(df_brut["Score"], showfliers=False, labels=['Score'])
# NOTE(review): the style is selected after the figure has been drawn, so it
# presumably only affects figures created by later cells -- confirm whether it
# was meant to be applied before plt.figure().
plt.style.use('fivethirtyeight')
plt.show()

In [None]:
# Keep only the questions whose score is at least 3, then extract the two
# columns studied in the rest of the notebook (same rows, same index).
score_mask = df_brut['Score'] >= 3
posts_tags = df_brut.loc[score_mask, 'Tags']
posts_body = df_brut.loc[score_mask, 'Body']
print("Nombre de posts :", posts_tags.shape[0])

# Analyse de la variable TAGS

In [None]:
# Preview the first five raw tag strings (head() defaults to 5 rows).
posts_tags.head()

In [None]:
# First ten posts whose raw tag string contains the '<c#>' tag.
has_csharp = posts_tags.str.contains('<c#>')
posts_tags[has_csharp].head(10)

In [None]:
# Count the occurrences of every tag over the selected posts.
from collections import Counter
alltags_count = Counter()
# Tokens are obtained by splitting on '>', so each key keeps its leading '<'
# (e.g. '<c#'); the later cells split the same way and rely on that form.
for raw_tags in posts_tags:
    alltags_count.update(token for token in raw_tags.split('>') if token != '')

In [None]:
# Number of distinct tags found (len of a Counter is its number of keys).
print('There are %d tags' % len(alltags_count))

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the 30 most used tags on Stack Overflow

top30tags = alltags_count.most_common(30)
# Each entry is a (tag, occurrences) pair.
label_x = [tag for tag, _ in top30tags]
y_axis = [occurrences for _, occurrences in top30tags]

plt.figure(figsize=(20, 5))
ax = plt.axes()
plt.bar(label_x, y_axis)
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.title("30 most used tags on Stackoverflow",fontsize=20)
plt.xlabel('Tag', fontsize=18)
plt.ylabel('Number of occurrences', fontsize=18)
plt.style.use('fivethirtyeight')
plt.show()

### Nombre de posts impactés par les 100 à 1000 premiers tags

In [None]:
# For each number of kept tags (100, 200, ..., 1000), count the posts that
# carry at least one of those most frequent tags.
nb_posts_targeted = dict()

# Split the raw tag strings once instead of once per loop iteration.
posts_tag_lists = posts_tags.str.split('>')

for nb_top_tags in np.linspace(100, 1000, 10, dtype='int'):

    top_tags = [tag[0] for tag in alltags_count.most_common(nb_top_tags)]
    # Set membership is O(1); scanning the list for every tag of every post
    # made this cell quadratic.
    top_tags_set = set(top_tags)

    # A post counts once as soon as one of its tags is among the kept tags.
    nb_posts_targeted[nb_top_tags] = sum(
        1 for tags in posts_tag_lists if not top_tags_set.isdisjoint(tags)
    )

    percentage_posts = (nb_posts_targeted[nb_top_tags]/posts_tags.shape[0])*100
    print('With %d tags kept, %.2f %% of the posts are concerned ' % (nb_top_tags, percentage_posts))

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the percentage of posts having at least one of the top tags

kept_sizes = np.arange(100, 1100, 100)  # 100, 200, ..., 1000
total_posts = posts_tags.shape[0]
label_x = [str(size) for size in kept_sizes]
y_axis = [nb_posts_targeted[size]/total_posts*100 for size in kept_sizes]

plt.figure(figsize=(20, 5))
ax = plt.axes()
plt.bar(label_x, y_axis)
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.title("Percentage of posts that have at least one of the top tags",fontsize=20)
plt.xlabel('Total number of tags kept', fontsize=18)
plt.ylabel('Percentage of posts', fontsize=18)
plt.style.use('fivethirtyeight')
plt.show()

### Nombre de posts impactés par les 10 à 100 premiers tags

In [None]:
# For each number of kept tags (10, 20, ..., 100), count the posts that carry
# at least one of those most frequent tags.
nb_posts_targeted = dict()

# Split the raw tag strings once instead of once per loop iteration.
posts_tag_lists = posts_tags.str.split('>')

for nb_top_tags in np.linspace(10, 100, 10, dtype='int'):

    top_tags = [tag[0] for tag in alltags_count.most_common(nb_top_tags)]
    # Set membership is O(1); scanning the list for every tag of every post
    # made this cell needlessly slow.
    top_tags_set = set(top_tags)

    # A post counts once as soon as one of its tags is among the kept tags.
    nb_posts_targeted[nb_top_tags] = sum(
        1 for tags in posts_tag_lists if not top_tags_set.isdisjoint(tags)
    )

    percentage_posts = (nb_posts_targeted[nb_top_tags]/posts_tags.shape[0])*100
    print('With %d tags kept, %.2f %% of the posts are concerned ' % (nb_top_tags, percentage_posts))

In [None]:
import matplotlib.pyplot as plt
# Bar chart of the percentage of posts having at least one of the top tags

# The upper bound of np.arange is exclusive: 110 is required to include the
# 100-tag point computed by the previous cell (the former bound of 100 stopped
# at 90 and silently dropped the last bar).
kept_sizes = np.arange(10, 110, 10)
total_posts = posts_tags.shape[0]

plt.figure(figsize=(20, 5))
y_axis = [nb_posts_targeted[size]/total_posts*100 for size in kept_sizes]
label_x = [str(size) for size in kept_sizes]
ax = plt.axes()
plt.bar(label_x, y_axis)
plt.xticks(rotation=90, fontsize=15)
plt.yticks(fontsize=15)
plt.title("Percentage of posts that have at least one of the top tags",fontsize=20)
plt.xlabel('Total number of tags kept', fontsize=18)
plt.ylabel('Percentage of posts', fontsize=18)
plt.style.use('fivethirtyeight')
plt.show()

In [None]:
def keep_most_common_tags(tags, top_tags, n_tags):
    '''Keep only the tags of a post that belong to the most used tags.

    The kept tags are ordered by their position in ``top_tags`` (most used
    first), so truncating to ``n_tags`` retains the most popular ones.

    tags: raw tag string of a post; tokens are obtained by splitting on '>'
          (each token therefore keeps its leading '<', e.g. '<c#')
    top_tags: most common tags, most used first, in the same token form
    n_tags: maximum number of tags kept per post

    Returns the list of kept tags (possibly empty).
    '''

    # Rank of each top tag; setdefault keeps the first position if a tag is
    # repeated, which matches list.index().
    rank = {}
    for position, tag in enumerate(top_tags):
        rank.setdefault(tag, position)

    tags_filtered = [tag for tag in tags.split('>') if tag in rank]
    # Explicit sort: the previous `insert(index, tag)` clamps the position to
    # the current list length and therefore did not order the tags by rank.
    tags_filtered.sort(key=rank.__getitem__)

    return tags_filtered[:n_tags]

Je ne vais conserver qu'un nombre réduit de tags : d'une part, parce qu'il est plus facile de prédire le bon tag à partir d'un nombre réduit de mots ; d'autre part, parce que, parmi les 10 000 tags, certains sont extrêmement confidentiels et spécifiques.

### Top 100 tags = 74% des posts impactés
### Aucune limitation sur le nombre de tags par post (jusqu'à 5 tags conservés, soit le maximum autorisé par Stack Overflow)

In [None]:
# The 100 most used tags, most frequent first.
top100tags = [tag for tag, _ in alltags_count.most_common(100)]

print("Nombre de posts avant : {0}".format(posts_tags.shape[0]))

# Keep, for every post, up to 5 of its tags that belong to the top 100, then
# discard the posts left without any tag.
filtered_top100 = posts_tags.apply(keep_most_common_tags, args=(top100tags, 5))
posts_top100tags = filtered_top100[filtered_top100.map(len) > 0]

print('Nombre de posts après : {0}'.format(posts_top100tags.shape[0]))

#### Fréquence des tags conservés 

In [None]:
# Share (in %) of each of the 100 most used tags among their total occurrences.
occurrences_top100 = np.asarray([occurrences for _, occurrences in alltags_count.most_common(100)])
toptags_count = pd.Series(occurrences_top100 / sum(occurrences_top100) * 100, index=top100tags)
toptags_count.map("{0:.2f}%".format)

#### Nombre de tags par post et fréquence

In [None]:
# One-hot encode the kept tags (one column per tag) for the supervised models
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit_transform(posts_top100tags)
# Row sums give the number of kept tags per post.
Y_sum = Y.sum(axis=1)
tags_per_post = collections.Counter(Y_sum)
print(list(tags_per_post.values()))
print(list(tags_per_post.keys()))
print(Y.shape)

In [None]:
# Number of kept tags per post
Y_n_tags = np.sum(multilabel_binarizer.fit_transform(posts_top100tags),axis=1)
# How many posts have 1, 2, ... kept tags
multiLabel_counts = collections.Counter(Y_n_tags)

# Sort by number of tags so the data order matches the order in which the bars
# are drawn.
n_tags_values = sorted(multiLabel_counts)
n_posts_values = [multiLabel_counts[n_tags] for n_tags in n_tags_values]
total_posts = sum(n_posts_values)

plt.figure(figsize=(15,5))
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
ax = sns.barplot(x=n_tags_values, y=n_posts_values)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("Questions having multiple tags",fontsize=20)
plt.xlabel('Number of tags', fontsize=18)
plt.ylabel('Number of posts', fontsize=18)
# Add the share of posts above each bar. The label is derived from the bar's
# own height, so it cannot be attached to the wrong bar (the former labels
# followed the most_common() order, which differs from the drawing order).
rects = ax.patches
labels = ["{:0.1%}".format(rect.get_height() / total_posts) for rect in rects]
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=15)
plt.style.use('fivethirtyeight')
plt.show()

### Limitation à 3 tags par post

In [None]:
print("Nombre de posts avant : {0}".format(posts_tags.shape[0]))

# Keep, for every post, at most 3 of its tags that belong to the top 100, then
# discard the posts left without any tag.
filtered_3labels = posts_tags.apply(keep_most_common_tags, args=(top100tags, 3))
posts_top100tags_3labels = filtered_3labels[filtered_3labels.map(len) > 0]

print('Nombre de posts après : {0}'.format(posts_top100tags_3labels.shape[0]))

#### Fréquence des tags conservés 

In [None]:
# Recount the occurrences of each tag, since the number of tags per post has
# been reduced.
alltags_count_reduced = Counter()
for kept_tags in posts_top100tags_3labels:
    alltags_count_reduced.update(tag for tag in kept_tags if tag != '')

# Tags and counts are taken from the SAME ranking. Indexing the percentages
# with `top100tags` (ranked on the full counts) attached them to the wrong
# tags whenever the ranking changed, and raised a length mismatch if a tag
# disappeared after truncation.
reduced_ranking = alltags_count_reduced.most_common(100)
reduced_tags = [tag for tag, _ in reduced_ranking]
reduced_occurrences = np.asarray([occurrences for _, occurrences in reduced_ranking])
toptags_count = pd.Series(reduced_occurrences / reduced_occurrences.sum() * 100, index=reduced_tags)
toptags_count.apply(lambda x: "{0:.2f}%".format(x))

In [None]:
# One-hot encode the kept tags (one column per tag) for the supervised models
multilabel_binarizer = MultiLabelBinarizer()
Y = multilabel_binarizer.fit_transform(posts_top100tags_3labels)
# Row sums give the number of kept tags per post.
Y_sum = Y.sum(axis=1)
tags_per_post = collections.Counter(Y_sum)
print(list(tags_per_post.values()))
print(list(tags_per_post.keys()))
print(Y.shape)

In [None]:
# Binarizing the tags for the supervised models
multilabel_binarizer = MultiLabelBinarizer()
# Number of kept tags per post
Y_n_tags = np.sum(multilabel_binarizer.fit_transform(posts_top100tags_3labels),axis=1)
# How many posts have 1, 2 or 3 kept tags
multiLabel_counts = collections.Counter(Y_n_tags)

# Sort by number of tags so the data order matches the order in which the bars
# are drawn.
n_tags_values = sorted(multiLabel_counts)
n_posts_values = [multiLabel_counts[n_tags] for n_tags in n_tags_values]
total_posts = sum(n_posts_values)

plt.figure(figsize=(15,5))
# x / y are passed by keyword: recent seaborn releases no longer accept them
# positionally.
ax = sns.barplot(x=n_tags_values, y=n_posts_values)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title("Questions having multiple tags",fontsize=20)
plt.xlabel('Number of tags', fontsize=18)
plt.ylabel('Number of posts', fontsize=18)
# Add the share of posts above each bar. The label is derived from the bar's
# own height, so it cannot be attached to the wrong bar (the former labels
# followed the most_common() order, which differs from the drawing order).
rects = ax.patches
labels = ["{:0.1%}".format(rect.get_height() / total_posts) for rect in rects]
for rect, label in zip(rects, labels):
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width()/2, height + 5, label, ha='center', va='bottom', fontsize=15)
plt.style.use('fivethirtyeight')
plt.show()

In [None]:
# SAVE
# The pickled counter is the one recomputed after limiting posts to 3 tags.
alltags_counter = alltags_count_reduced
with open('alltags_counter.pkl', 'wb') as f:
    pickle.dump(alltags_counter, f)

# NOTE(review): posts_body still holds every post with Score >= 3, whereas the
# two tag series have lost the posts without any top-100 tag; consumers
# presumably have to align them on the index -- confirm.
for series, target in (
    (posts_body, "./posts_body.pkl"),
    (posts_top100tags, "./posts_top100tags.pkl"),
    (posts_top100tags_3labels, "./posts_top100tags_3labels.pkl"),
):
    series.to_pickle(target)