In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import re
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

## READ DATASET

In [2]:
df = pd.read_csv("dataset.csv")
df.head(3)

Unnamed: 0,creation_date,component_name,product_name,short_description,long_description,assignee_name,reporter_name,resolution_category,resolution_code,status_category,status_code,update_date,quantity_of_votes,quantity_of_comments,resolution_date,bug_fix_time,severity_category,severity_code
0,2015-05-22,Core,product_123,LogTraceException in ProposalUtils.toMethodNam...,The following incident was reported via the au...,recommenders-inbox,error-reports-inbox,fixed,1,closed,6,2015-05-27,0,2,2015-05-27,5,normal,2
1,2015-03-29,Engine,product_120,CCE in DecorationNodeImpl.eSet (159),The following incident was reported via the au...,serg.boyko2011,error-reports-inbox,fixed,1,resolved,4,2015-04-01,0,8,2015-03-31,2,normal,2
2,2018-01-20,Compendium,product_52,[http servlet] During dispatching javax.servle...,Original issue https://issues.liferay.com/brow...,raymond.auge,raymond.auge,fixed,1,resolved,4,2018-01-22,0,3,2018-01-22,2,normal,2


In [3]:
print("UNIQUE_CLASSES",df.product_name.nunique())


UNIQUE_CLASSES 173


In [4]:
df.creation_date=pd.to_datetime(df.creation_date)
df.update_date=pd.to_datetime(df.update_date)
df.resolution_date = pd.to_datetime(df.resolution_date)

In [5]:
df["year"]=df.creation_date.apply(lambda x :x.year)

#### FEATURES & Intuition behind why they are included or not in training: 

* **creation_date**
    * Date at which bug was reported for the product
    * Don't seems relevant to product 


* **update_date**
    * Data at which any update was made for the bug (comment or anything)
    * Don't seems relevant to product, depends on tester


* **resolution_date**
    * Issue resolution date
    * Don't seems relevant to product, depends on tester
    

* **component_name** 
    * Component related to product which had bug
    * Used as a feature to train model
    * Seems relevant to product info
    

* **short_description**
    * short desctiption of bug found by reporter
    * Used as a feature to train model
    

* **long_description**
    * Complete desctiption with error message & codes of bug 
    * Used as a feature to train model
    

* **assignee_name**
     * To whom issue was assign to.
     * can be a random person name, this feature will just add noise to data. No correlation at all.
     * Not used in training
     

* **reporter_name**
     * Reporter of issue
     * can be a random person name, this feature will just add noise to data. No correlation at all.
     * Not used in training
     

* **resolution_category**
    * Only Unique value is `Fixed`. Doesn;t add any info for model
    * Not used in training
    

* **resolution_code**
    * Interger reperesentaion on resolution_category
    * Not used in training
    

* **status_category**
    * 2 values `closed` OR `resolved` 
    * Not related to product name, it is changed by assignee or reporter. 
    * Even after using it as a feature to train, I found it was not adding any relevant gain to metrics
    * Not used in training
    
     
* **status_code**
    * Interger reperesentaion on status_category
    * Not used in training
    

* **quantity_of_votes**
    * Number of votes to bugs maybe 
    * Not related to product name, it is changed by assignee or reporter. 
    * Even after using it as a feature to train, I found it was not adding any relevant gain to metrics
    * Not used in training
    

* **quantity_of_comments**
    * Number of comments to bugs maybe 
    * Not related to product name, it is changed by assignee or reporter. 
    * Even after using it as a feature to train, I found it was not adding any relevant gain to metrics
    * Not used in training
    

* **bug_fix_time**
    * creation_date - resolution_date 
    * Not usefull in describeing product info, can be random because of availabity of tester's time.
    * Even after using it as a feature to train, I found it was not adding any relevant gain to metrics
    * Not used in training
    
    
* **severity_category**
    * Category of the issue : `['normal', 'blocker', 'trivial', 'minor', 'major', 'critical']`
    

* **severity_code**
    * Interge reperesentaion on severity_category
    * Code `2` is used for both `normal` & `minor` , which don;t seems right. 


* **product_name** 
    * label

## Data Preprocessing

#### Tokenizer to split descriptions &  error messages


* removing `[]` from certain words like [http servlet] which helps in tfidf vectorizer. Other tfidf vectorizer would treat `html` & `[html` as separate words.

* Tried lemmatizer & stemmer but didn't help in imporving metrics so didn't included in final version.

In [6]:
def custom_tokenizer(text): 
    tokens=[]
    for token in text.split(" "):
        tmp=[ (sub_word) for _token in token.split(".") for sub_word in _token.split("_")]
        tokens.extend(tmp)                    
    return " ".join(tokens)


In [7]:
df["short_description"] = df.short_description.apply(lambda x : re.sub(r'[\[\]]',"" ,x))
df["short_description"] = df.short_description.apply(custom_tokenizer)
df.long_description=  df.long_description.fillna(" NONE ")
df.long_description = df.long_description.apply(lambda x : re.sub(r'[\[\]]',"" ,x))
df.long_description = df.long_description.apply(custom_tokenizer)



## created encoding for categorical features

In [8]:
map_dict_component_name={j.lower() :i+1  for i, j in enumerate(df["component_name"].unique())}
map_dict={j :i+1  for i, j in enumerate(df["severity_category"].unique())}


In [9]:
map_dict

{'normal': 1,
 'blocker': 2,
 'trivial': 3,
 'minor': 4,
 'major': 5,
 'critical': 6}

In [10]:
df["severity_category"]= df.severity_category.map(map_dict)
df["component_name"]= df.component_name.apply(lambda x: x.lower())
df["component_name"]= df.component_name.map(map_dict_component_name)

### Joined  short_description & long_description for tfidf vector training

In [11]:
df["text"]=df.apply(lambda x: "SHORT DESCRIPTION " + x["short_description"]+\
                        ". LONG DESCRIPTION "  + x["long_description"] , axis=1)


## Created Train test_split using stratify sampling 

In [12]:
append_in_train = df.product_name.value_counts()[df.product_name.value_counts()<10].index
update_train=df[df.product_name.isin(append_in_train)]
df=df[~df.product_name.isin(append_in_train)]
train_df , test_df = train_test_split(df,test_size=0.2,stratify=df.product_name,random_state=1234)

In [13]:
train_df.shape

(7600, 20)

In [14]:
train_df=pd.concat([train_df,update_train])
train_df.shape

(7876, 20)

##  Creating text based features 


In [15]:
tf_vect = TfidfVectorizer(stop_words="english",analyzer="word", ngram_range=(1,1) , min_df=4 ,max_df=0.6)

In [16]:
train_tf_vec=tf_vect.fit_transform(train_df.text,)
train_tf_vec

<7876x8054 sparse matrix of type '<class 'numpy.float64'>'
	with 285424 stored elements in Compressed Sparse Row format>

In [17]:
test_tf_vec = tf_vect.transform(test_df.text)

In [18]:
train_df.columns

Index(['creation_date', 'component_name', 'product_name', 'short_description',
       'long_description', 'assignee_name', 'reporter_name',
       'resolution_category', 'resolution_code', 'status_category',
       'status_code', 'update_date', 'quantity_of_votes',
       'quantity_of_comments', 'resolution_date', 'bug_fix_time',
       'severity_category', 'severity_code', 'year', 'text'],
      dtype='object')

## Adding Tablular features to data

In [19]:
tab_feature = ['component_name',  'severity_category']
train_df[tab_feature].head()

Unnamed: 0,component_name,severity_category
5131,33,1
1346,87,5
8019,144,6
3034,35,1
6120,231,1


In [20]:
train_feat=np.hstack(( train_df[tab_feature].values ,\
         train_tf_vec.toarray()
       ))

test_feat=np.hstack(( test_df[tab_feature].values ,\
         test_tf_vec.toarray()
       ))



## Building Random Forest classifier with class_wieght for imbalanced dataset

In [21]:
rf=RandomForestClassifier( n_estimators=50, random_state = 42,class_weight="balanced",max_depth=300)
rf.fit(train_feat,train_df.product_name)
preds=rf.predict(test_feat )


##  Metric 

* `accuracy : 62`
* `weighted_Precision : 67` 
* `weighted_Recall : 62`
* `weighted_F1_score : 61`


In [22]:
print(classification_report(test_df.product_name,preds,zero_division=1,))

              precision    recall  f1-score   support

   product_1       1.00      0.20      0.33         5
  product_10       0.79      0.87      0.83       110
 product_100       1.00      0.00      0.00         3
 product_101       1.00      0.00      0.00         4
 product_103       1.00      0.60      0.75         5
 product_104       0.80      0.57      0.67         7
 product_105       1.00      0.33      0.50         3
 product_106       0.62      0.73      0.67        11
 product_107       0.00      1.00      0.00         0
 product_108       0.80      0.67      0.73         6
 product_109       0.57      0.67      0.62         6
  product_11       0.00      1.00      0.00         0
 product_110       0.78      0.64      0.70        59
 product_111       1.00      0.50      0.67         2
 product_113       0.60      0.84      0.70        50
 product_114       1.00      0.00      0.00         6
 product_115       0.67      0.60      0.63        55
 product_116       0.77    

## Saving model

In [24]:
# import joblib 
# joblib.dump(rf,"APP/app/models/model.pkl")
# joblib.dump(tf_vect,"APP/app/models/tf_vectorizer.pkl")
# joblib.dump(map_dict,"APP/app/models/severity_category_mapping.pkl")
# joblib.dump(map_dict_component_name,"APP/app/models/component_name_mapping.pkl")

['APP/app/models/component_name_mapping.pkl']

## Instruction to run API

* USING DOCKER
    * cd project
    * docker-compose build
    * docker-compose up
    * http://localhost:8000/docs
* USING PYTHON
    * cd project
    * pip install -r requirements.txt
    * from folder project -> cd app
    * python main_app.py 
    * http://localhost:8000/docs