In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the training split, then add a combined "text" column (title + space + body).
train_df = pd.read_json('/kaggle/input/github-bugs-prediction/embold_train.json')
train_df = train_df.assign(text=train_df["title"] + " " + train_df["body"])
train_df.head()

### Attribute Description:

    - Title - the title of the GitHub bug, feature, or question
    - Body - the body of the GitHub bug, feature, or question
    - Label - the class of the issue, encoded as:
        Bug - 0
        Feature - 1
        Question - 2
        
    - Text - Title and Body concatenated into a single text feature

### Data quick glance

In [None]:
# Report the frame's dimensions and column names.
n_rows, n_cols = train_df.shape
print(f'We have {n_rows} rows and {n_cols} columns')
print('\n')
print('Remember we combined Title and Body to create new column "Text"')
print(f'columns {train_df.columns}')

In [None]:
# Display the dtype of every column in train_df.
train_df.dtypes  # check data types 

In [None]:
def label_encode(data, from_numeric= True):
    """Translate between numeric label codes and their class names.

    Args:
        data: a numeric code (0, 1, 2) when ``from_numeric`` is True,
            otherwise a class name ('Bug', 'Feature', 'Question').
        from_numeric: direction of the conversion; True maps code -> name,
            False maps name -> code.

    Returns:
        The converted value, or None when ``data`` is not a known
        code/name for the requested direction.
    """
    code_to_name = {0: 'Bug', 1: 'Feature', 2: 'Question'}

    if from_numeric:
        return code_to_name.get(data)

    # Reverse direction: invert the table and look the name up.
    name_to_code = {name: code for code, name in code_to_name.items()}
    return name_to_code.get(data)

In [None]:
# Convert the numeric 'label' codes to their class names for better visualization.
# The conversion is guarded by a dtype check so the cell is safe to re-run:
# label_encode() returns None for values that are already names, so applying it
# a second time would silently wipe out the whole column.
if pd.api.types.is_numeric_dtype(train_df['label']):
    train_df['label'] = train_df['label'].apply(label_encode)

train_df['label'] = train_df['label'].astype('category')  # convert to category dtype

train_df['label'].head()

### Statistical Analysis-I
Okay, let's compute some basic descriptive statistics on the raw training set.

In [None]:
import seaborn as sns

In [None]:
# Number of rows per class label.
train_df['label'].value_counts()

In [None]:
# Bar chart of the number of rows per class label.
sns.countplot(data=train_df, x='label')

print('There are too many request for bug  and feature, less on Questioin \nClearly our class is imbalanced')

#### Let's see how character and word counts vary across the classes

In [None]:
# Per-row size features: character count and whitespace-delimited word count
# of the text (values are stringified first, so non-string entries still get a length).
text_as_str = train_df['text'].astype(str)
train_df['text_len'] = text_as_str.map(len)
train_df['word_count'] = train_df['text'].map(lambda value: len(str(value).split()))

In [None]:
# Side-by-side histograms of the per-issue word count and character count.
# Uses explicit axes instead of the pyplot state machine; the unused
# `order_index` value_counts computations from the earlier version were dropped.
fig, (ax_words, ax_chars) = plt.subplots(1, 2, figsize=(15, 5))

sns.distplot(train_df.word_count, kde=False, ax=ax_words)
ax_words.set_title('Overall number of words used')

sns.distplot(train_df.text_len, kde=False, ax=ax_chars)
ax_chars.set_title('Overall number characters used')

- The majority of issues use fewer than 200 words
- The number of characters used stays within a fairly consistent range

Below is a plotly version of the above visualization.

In [None]:
from plotly.offline import iplot
import seaborn as sns

import cufflinks as cf
# Configure cufflinks so that the `.iplot(...)` calls below can render in the notebook.
# NOTE(review): go_offline() is immediately followed by a config write with
# offline=False — confirm that this combination is intended.
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

In [None]:
# Interactive histogram of the character length of each text.
text_len_hist_opts = dict(
    kind='hist',
    bins=100,
    xTitle='Text  length',
    linecolor='black',
    yTitle='count',
    title='Text Length Distribution',
)
train_df['text_len'].iplot(**text_len_hist_opts)

In [None]:
# Interactive histogram of the word count of each text.
word_count_hist_opts = dict(
    kind='hist',
    bins=100,
    xTitle='word count',
    linecolor='black',
    yTitle='count',
    title='word count Distribution',
)
train_df['word_count'].iplot(**word_count_hist_opts)

### So how do they (words used, characters used) vary across categories?

In [None]:
# Character length per class: one strip plot, then one histogram panel per label.
sns.catplot(data=train_df, x="label", y="text_len")

g = sns.FacetGrid(data=train_df, col="label")
g.map(sns.distplot, "text_len", kde=False)

In [None]:
# Word count per class: one strip plot, then one histogram panel per label.
sns.catplot(data=train_df, x="label", y="word_count")

g = sns.FacetGrid(data=train_df, col="label")
g.map(sns.distplot, "word_count", kde=False)

### Punctuation usage across classes?

In [None]:
import string

In [None]:
# count_Bug_punctuations      = train_df[train_df.label == 'Bug']['text'].apply(lambda z: len([c for c in str(z) if c in string.punctuation]))
# count_Feature_punctuations  = train_df[train_df.label == 'Feature']['text'].apply(lambda z: len([c for c in str(z) if c in string.punctuation]))
# count_Question_punctuations = train_df[train_df.label == 'Question']['text'].apply(lambda z: len([c for c in str(z) if c in string.punctuation]))

In [None]:
# Number of ASCII punctuation characters (string.punctuation) in each text.
train_df['count_punctuations'] = train_df['text'].apply(
    lambda text: sum(1 for ch in str(text) if ch in string.punctuation)
)

In [None]:
# One histogram panel per label, with shared axes so the panels are directly comparable.
facet_opts = dict(col="label", height=4, aspect=1, sharex=True, sharey=True)
g = sns.FacetGrid(train_df, **facet_opts)
g.map(sns.distplot, "count_punctuations", kde=False)

# print(f'X and Y range are different, so better look figure carefully !!')

"Bug" level has max punctuations - It make sense as report a bug would certainly contains more punctuations if code or code logs are mention 

### Stopwords usage across class ?

In [None]:
import nltk
from nltk.corpus import stopwords
# English stopword list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

In [None]:
# Count the English stopwords in each text.
# The previous expression, np.mean([len(z) for w in str(z).split()]), averaged
# len(z) over the words, i.e. it returned the text length and never consulted
# the stopword list at all.
stop_word_set = set(stop_words)  # set membership is O(1); the NLTK list is lowercase
train_df['stop_words'] = train_df.text.apply(
    lambda z: sum(1 for w in str(z).lower().split() if w in stop_word_set)
)

g = sns.FacetGrid(train_df, col="label" , height = 4, aspect = 1 , sharex = True , sharey = True )
g.map(sns.distplot, "stop_words",kde = False)

# print(f'X and Y range are different, so better look figure carefully !!')

A similar insight as for punctuation:
- The "Bug" class has the most stopwords. This makes sense: a bug report will certainly contain more stopwords when code or log output is included.

In [None]:
# Number of whitespace-delimited tokens that contain a URL scheme marker.
# Any token containing 'https' also contains 'http', so one substring test suffices.
train_df['check_url'] = train_df['text'].apply(
    lambda text: sum(1 for token in str(text).lower().split() if 'http' in token)
)

url_grid = sns.FacetGrid(train_df, col="label", height=4, aspect=1, sharex=True, sharey=True)
g = url_grid
g.map(sns.distplot, "check_url", kde=False)

# print(f'X and Y range are different, so better look figure carefully !!')

This also matches our common sense: people tend to include reference links when requesting a feature or when pointing to a tentative solution for a bug.

WordCloud Visualizations


In [None]:
import wordcloud
from wordcloud import WordCloud,STOPWORDS

from PIL import Image


In [None]:
def display_cloud(data,color):
    """Render a word cloud built from a collection of text strings.

    Args:
        data: iterable of texts (e.g. a pandas Series). Entries that are not
            strings (such as NaN for missing text) are skipped instead of
            making ``str.join`` raise a TypeError.
        color: forwarded to WordCloud as ``contour_color``.
            NOTE(review): wordcloud documents the contour as being drawn only
            when a mask is supplied, so this likely has no visible effect
            here — confirm.

    Returns:
        None. The figure is shown via ``plt.show()``.
    """
    texts = [text for text in data if isinstance(text, str)]

    plt.subplots(figsize=(15,15))
    wc = WordCloud(stopwords=STOPWORDS,
                   mask=None, background_color="white", contour_width=2, contour_color=color,
                   max_words=2000, max_font_size=256,
                   random_state=42)  # fixed seed keeps the layout reproducible
    wc.generate(' '.join(texts))
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()

In [None]:
# Word cloud over every text in the training set.
display_cloud(train_df['text'], color='red')

In [None]:
# Word cloud restricted to rows labelled 'Bug'.
print('For Bug class')
bug_text = train_df.loc[train_df['label'] == 'Bug', 'text']
display_cloud(bug_text, 'red')

"Bug" class wordcloud
- It makes sense for "error" and "issue" to occur frequently
- When an image is attached, it is most often in 'png' format
- Did you notice that Python files are mentioned frequently? That is the reason I love Python

In [None]:
# Word cloud restricted to rows labelled 'Feature'.
print('For Feature class')
feature_text = train_df.loc[train_df['label'] == 'Feature', 'text']
display_cloud(feature_text, 'red')

"Feature" class wordcloud
- It makes sense for "add" to be a frequently occurring token. Mostly features are to be added, right? Or are you a feature-deletion person :)
- You can probably already guess why the word "github" occurs so often too!


In [None]:
# Word cloud restricted to rows labelled 'Question'.
print('For Question class')
question_text = train_df.loc[train_df['label'] == 'Question', 'text']
display_cloud(question_text, 'red')

## Most frequently occurring words distribution?

Hey! Why are we seeing so many unknown characters in the bar plots below?
-  Remember, we have not cleaned the data yet
-  I just want to show you that text analysis is not a clean job :)

We'll clean the stopwords later on.

In [None]:
#Simplified counter function

from collections import Counter

def create_corpus(word, df=None):
    """Collect every whitespace-delimited token from the texts of one class.

    Args:
        word: label value to select (compared against the 'label' column).
        df: DataFrame with 'label' and 'text' columns. Defaults to the
            notebook-level ``train_df`` so existing ``create_corpus(label)``
            calls keep working.

    Returns:
        A flat list of tokens, in row order. Rows whose text is missing are
        skipped (splitting them yields NaN, which cannot be iterated).
    """
    if df is None:
        df = train_df

    texts = df.loc[df['label'] == word, 'text'].dropna()
    return [token for tokens in texts.str.split() for token in tokens]


In [None]:
# English stopwords as a set, used for membership tests when filtering the top words below.
stops=set(stopwords.words('english'))

In [None]:
# Set the default figure size (in inches) via seaborn's rc parameters.
sns.set(rc={'figure.figsize':(11.7,8.27)})

In [None]:
# Bar chart of the non-stopword tokens among the 100 most frequent 'Bug' tokens.
corpus = create_corpus('Bug')
counter = Counter(corpus)
most = counter.most_common()

top_pairs = [(word, count) for word, count in most[:100] if word not in stops]
x = [word for word, _ in top_pairs]
y = [count for _, count in top_pairs]
sns.barplot(x=y, y=x)

In [None]:
# Bar chart of the non-stopword tokens among the 100 most frequent 'Feature' tokens.
corpus = create_corpus('Feature')
counter = Counter(corpus)
most = counter.most_common()

top_pairs = [(word, count) for word, count in most[:100] if word not in stops]
x = [word for word, _ in top_pairs]
y = [count for _, count in top_pairs]
sns.barplot(x=y, y=x)

In [None]:
# Bar chart of the non-stopword tokens among the 100 most frequent 'Question' tokens.
corpus = create_corpus('Question')
counter = Counter(corpus)
most = counter.most_common()

top_pairs = [(word, count) for word, count in most[:100] if word not in stops]
x = [word for word, _ in top_pairs]
y = [count for _, count in top_pairs]
sns.barplot(x=y, y=x)

## Inference so far - raw data

- Class balance: the "Question" class has far fewer samples than "Bug" and "Feature", while the latter two have almost the same counts
- Stopwords: stopwords contribute a large amount of junk in the "Bug" and "Feature" classes, though these are also the majority classes in the distribution
- Unusually long texts are present
- Cleaning of the text is recommended - not only stopwords, but also noise including HTML, etc.

## Statistical Analysis-II
N-gram analysis - to be continued

In [None]:
# def gram_analysis(data,gram):
    
#     token= tokenizer.tokenize(data.lower()) 
#     token = [tok for tok in token if len(tok) > 2 if tok not in stopword_list and not tok.isdigit()]
#     ngrams=zip(*[token[i:] for i in range(gram)])
#     final_tokens=[" ".join(z) for z in ngrams]
#     return final_tokens


# def create_dict(data,grams):
#     freq_dict=defaultdict(int)
#     for sentence in data:
#         for tokens in gram_analysis(sentence,grams):
#             freq_dict[tokens]+=1
#     return freq_dict

# def horizontal_bar_chart(df, color):
#     trace = go.Bar(
#         y=df["n_gram_words"].values[::-1],
#         x=df["n_gram_frequency"].values[::-1],
#         showlegend=False,
#         orientation = 'h',
#         marker=dict(
#             color=color,
#         ),
#     )
#     return trace

# def create_new_df(freq_dict,):
#     freq_df=pd.DataFrame(sorted(freq_dict.items(),key=lambda z:z[1])[::-1])
#     freq_df.columns=['n_gram_words','n_gram_frequency']
#     trace=horizontal_bar_chart(freq_df[:20],'orange')
#     return trace

In [None]:
# def plot_grams(df1,df2,df3):
#     fig = tools.make_subplots(rows=1, cols=3, vertical_spacing=0.1,
#                           subplot_titles=["Frequent words of lable 0", 
#                                           "Frequent words of lable 1",
#                                           "Frequent words of lable 2"])
#     fig.append_trace(df1, 1, 1)
#     fig.append_trace(df2, 1, 2)
#     fig.append_trace(df3, 1, 3)
#     fig['layout'].update(height=800, width=1000, paper_bgcolor='rgb(233,233,233)', title="Word Count Plots")
#     py.iplot(fig, filename='word-plots')

In [None]:
# for gram in range(2,4):
    
#     if(gram == 2):
#         print("Bi-gram analysis")
#     else:
#         print("Tri-gram analysis")

#     freq_label_0_zero=create_dict(label_0_df['text'][:400],gram)
#     trace_zero=create_new_df(freq_label_0_zero)
    
#     freq_label_1_ones=create_dict(label_1_df['text'][:400],gram)
#     trace_ones=create_new_df(freq_label_1_ones)
    
#     freq_label_2_ones=create_dict(label_2_df['text'][:400],gram)
#     trace_secs=create_new_df(freq_label_2_ones)
    
#     plot_grams(trace_zero,trace_ones,trace_secs)

In [None]:
# train_df.columns

In [None]:
# count_Bug_punctuations      = train_df[train_df.label == 'Bug']['text'].apply(lambda z: len([c for c in str(z) if c in string.punctuation]))
# count_Feature_punctuations  = train_df[train_df.label == 'Feature']['text'].apply(lambda z: len([c for c in str(z) if c in string.punctuation]))
# count_Question_punctuations = train_df[train_df.label == 'Question']['text'].apply(lambda z: len([c for c in str(z) if c in string.punctuation]))

In [None]:
#Regex cleaning
import re

def remove_punctuations(data):
    """Return ``data`` with every character removed that is neither a word
    character (``\\w``) nor whitespace (``\\s``)."""
    return re.sub(r'[^\w\s]', r'', data)

def remove_html(data):
    """Return ``data`` with every ``<...>`` span removed.

    The match is non-greedy and ``.`` does not cross newlines, so a tag
    broken over several lines is left in place.
    """
    return re.sub(r'<.*?>', r'', data)

def remove_url(data):
    """Return ``data`` with URLs removed.

    A URL is any run of non-whitespace characters starting with
    ``http://``, ``https://`` or ``www.``. The earlier pattern only
    matched ``https://``, so plain ``http://`` links survived cleaning.
    """
    url_clean = re.compile(r"https?://\S+|www\.\S+")
    return url_clean.sub(r'', data)

def clean_data(data):
    """Return ``data`` with emoji and URLs removed.

    Two defects of the earlier version are fixed:

    * The character class contained the range U+24C2..U+1F251, which spans
      most of the Basic Multilingual Plane and therefore also deleted
      ordinary non-emoji text (CJK, Hangul, ...). It is replaced by the
      specific blocks it was meant to cover.
    * The URL pattern only matched ``https://``; ``http://`` links are now
      removed as well.
    """
    emoji_clean = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "\U00002600-\U000026FF"  # miscellaneous symbols
        "\U00002702-\U000027B0"  # dingbats
        "\U000024C2"             # circled latin capital M
        "\U0001F200-\U0001F251"  # enclosed ideographic supplement
        "]+"
    )
    data = emoji_clean.sub(r'', data)

    url_clean = re.compile(r"https?://\S+|www\.\S+")
    return url_clean.sub(r'', data)

In [None]:
# Clean the text column in place, applying the cleaners one after another.
# Order matters: URLs and HTML tags are removed before punctuation is stripped,
# because stripping punctuation first would destroy the markers they match on.
for cleaner in (remove_url, clean_data, remove_html, remove_punctuations):
    train_df['text'] = train_df['text'].apply(cleaner)