In [1]:
#!sudo pip3 install pandas
#!sudo pip3 install sklearn
#!sudo pip3 install graphviz
#!sudo pip3 install seaborn

In [2]:
import pandas as pd

# I. Traitement des données

### 1. Importation du dataset contenant les données débloatées 
(Plusieurs colonnes normalisées (caller_id, opened_by, sys_created_by, sys_updated_by, location, category, subcategory, u_symptom, cmdb_ci, assignement_group, assigned_to, problem_id, rfc, vendor, closed_code, resolved_by ne contiennent plus que des valeurs numériques, sans perte d'information). Les "?" ont été remplacés par des cases vides. 
De plus, les variables de type object ou string ont été converties en float ou int, suivant le cas.

In [3]:
#La colonne "caused_by" a été supprimée car vide
dataset = pd.read_csv("incident_event_log_debloated.csv", sep=',', encoding='UTF-8')
dataset

Unnamed: 0,number,incident_state,active,reassignment_count,reopen_count,sys_mod_count,made_sla,caller_id,opened_by,opened_at,...,knowledge,u_priority_confirmation,notify,problem_id,rfc,vendor,closed_code,resolved_by,resolved_at,closed_at
0,INC0000045,New,True,0,0,0,True,2403.0,8.0,29/2/2016 01:16,...,True,False,Do Not Notify,,,,5.0,149.0,29/2/2016 11:29,5/3/2016 12:00
1,INC0000045,Resolved,True,0,0,2,True,2403.0,8.0,29/2/2016 01:16,...,True,False,Do Not Notify,,,,5.0,149.0,29/2/2016 11:29,5/3/2016 12:00
2,INC0000045,Resolved,True,0,0,3,True,2403.0,8.0,29/2/2016 01:16,...,True,False,Do Not Notify,,,,5.0,149.0,29/2/2016 11:29,5/3/2016 12:00
3,INC0000045,Closed,False,0,0,4,True,2403.0,8.0,29/2/2016 01:16,...,True,False,Do Not Notify,,,,5.0,149.0,29/2/2016 11:29,5/3/2016 12:00
4,INC0000047,New,True,0,0,0,True,2403.0,397.0,29/2/2016 04:40,...,True,False,Do Not Notify,,,,5.0,81.0,1/3/2016 09:52,6/3/2016 10:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
141707,INC0120835,Closed,False,1,0,4,True,116.0,12.0,16/2/2017 09:09,...,False,True,Do Not Notify,,,,9.0,9.0,16/2/2017 09:53,16/2/2017 09:53
141708,INC0121064,Active,True,0,0,0,True,116.0,12.0,16/2/2017 14:17,...,False,False,Do Not Notify,,,,6.0,9.0,16/2/2017 16:38,16/2/2017 16:38
141709,INC0121064,Active,True,1,0,1,True,116.0,12.0,16/2/2017 14:17,...,False,False,Do Not Notify,,,,6.0,9.0,16/2/2017 16:38,16/2/2017 16:38
141710,INC0121064,Resolved,True,1,0,2,True,116.0,12.0,16/2/2017 14:17,...,False,True,Do Not Notify,,,,6.0,9.0,16/2/2017 16:38,16/2/2017 16:38


In [4]:
dataset.dtypes

number                      object
incident_state              object
active                        bool
reassignment_count           int64
reopen_count                 int64
sys_mod_count                int64
made_sla                      bool
caller_id                  float64
opened_by                  float64
opened_at                   object
sys_created_by             float64
sys_created_at              object
sys_updated_by               int64
sys_updated_at              object
contact_type                object
location                   float64
category                   float64
subcategory                float64
u_symptom                  float64
cmdb_ci                    float64
impact                      object
urgency                     object
priority                    object
assignment_group           float64
assigned_to                float64
knowledge                     bool
u_priority_confirmation       bool
notify                      object
problem_id          

### 2. Conversion des types de données erronés
On va supprimer certaines colonnes qui sont inutiles pour notre régression, comme l'ID de l'incident et le RFC.
Pandas a également mal interprété le type de données de certaines colonnes, on applique donc une correction.

In [5]:
dataset = dataset.drop(columns="rfc")
dataset.active = dataset.active.astype(int)
dataset.made_sla = dataset.made_sla.astype(int)
print(dataset.incident_state.value_counts())
incident_state_dico = {"New":0, "Resolved":1, "Closed":2, "Active":3, "Awaiting User Info":4, "Awaiting Vendor":5, "Awaiting Problem":6, "Awaiting Evidence":7, "-100":8}

Active                38716
New                   36407
Resolved              25751
Closed                24985
Awaiting User Info    14642
Awaiting Vendor         707
Awaiting Problem        461
Awaiting Evidence        38
-100                      5
Name: incident_state, dtype: int64


In [6]:
#dataset.replace({"incident_state": incident_state_dico})
dataset.incident_state = dataset.incident_state.map(incident_state_dico)

In [7]:
print(dataset.contact_type.value_counts())
contact_type_dico = {"Phone":0, "Self service":1, "Email":2, "IVR":3, "Direct opening":4}
dataset.contact_type = dataset.contact_type.map(contact_type_dico)
dataset.contact_type

Phone             140462
Self service         995
Email                220
IVR                   18
Direct opening        17
Name: contact_type, dtype: int64


0         0
1         0
2         0
3         0
4         0
         ..
141707    2
141708    2
141709    2
141710    2
141711    2
Name: contact_type, Length: 141712, dtype: int64

In [8]:
print(dataset.impact.value_counts())
impact_dico = {"2 - Medium":2, "3 - Low":3, "1 - High":1}
dataset.impact = dataset.impact.map(impact_dico)
dataset.urgency = dataset.urgency.map(impact_dico)

2 - Medium    134335
3 - Low         3886
1 - High        3491
Name: impact, dtype: int64


In [9]:
print(dataset.priority.value_counts())
priority_dico = {"3 - Moderate":3, "4 - Low":4, "2 - High":2, "1 - Critical":1}
dataset.priority = dataset.priority.map(priority_dico)

3 - Moderate    132452
4 - Low           4030
2 - High          2972
1 - Critical      2258
Name: priority, dtype: int64


In [10]:
dataset.knowledge = dataset.knowledge.astype(int)
dataset.u_priority_confirmation = dataset.u_priority_confirmation.astype(int)

In [11]:
print(dataset.notify.value_counts())
notify_dico = {"Do Not Notify":0, "Send Email":1}
dataset.notify = dataset.notify.map(notify_dico)

Do Not Notify    141593
Send Email          119
Name: notify, dtype: int64


In [12]:
print(dataset.vendor.value_counts())
vendor_dico = {"8s":0, "Vendor 1":1, "Vendor 2":2, "Vendor 3":3}
dataset.vendor = dataset.vendor.map(vendor_dico)

8s          167
Vendor 1     69
Vendor 3      6
Vendor 2      2
Name: vendor, dtype: int64


In [13]:
dataset.dtypes

number                      object
incident_state               int64
active                       int64
reassignment_count           int64
reopen_count                 int64
sys_mod_count                int64
made_sla                     int64
caller_id                  float64
opened_by                  float64
opened_at                   object
sys_created_by             float64
sys_created_at              object
sys_updated_by               int64
sys_updated_at              object
contact_type                 int64
location                   float64
category                   float64
subcategory                float64
u_symptom                  float64
cmdb_ci                    float64
impact                       int64
urgency                      int64
priority                     int64
assignment_group           float64
assigned_to                float64
knowledge                    int64
u_priority_confirmation      int64
notify                       int64
problem_id          

In [14]:
dataset.shape

(141712, 34)

### 3. On cherche le temps de résolution pour chaque ligne
Pour chaque ligne, on calcule la différence de temps entre la résolution et l'ouverture de l'incident. Ces informations sont rangées dans une liste de la même longueur que le dataset.

In [15]:
Y_list = list()
def resolved_time(row):    
    cols = ['opened_at','closed_at']
    #cols = ['opened_at','resolved_at']
    opened_at = pd.to_datetime(row[cols[0]])
    closed_at = pd.to_datetime(row[cols[1]])
    #resolved_at = pd.to_datetime(row[cols[1]])
    
    time_completion = (closed_at - opened_at)
    #time_completion = (resolved_at - opened_at)
    
    #if time_completion > datetime.timedelta(seconds=0): 
    #    Y_list.append(time_completion)
    #else:
    #    Y_list.append(-1)
    Y_list.append(time_completion)
    
    
for index, row in dataset.iterrows():
    resolved_time(row)

KeyboardInterrupt: 

In [None]:
len(Y_list)


In [None]:
Y_list

### 4. On controle s'il y a des valeurs négatives. Nous allons donc remplacer ces valeurs négatives par "-1" afin d'avoir des données logiques.

In [None]:
import numpy as np
import datetime
minutesList = list()
length = len(dataset) 
for i in range(length):
    if Y_list[i] > datetime.timedelta(seconds=0):
        minutesList.append(Y_list[i].total_seconds()/60)
    else:
        minutesList.append(-1)

duration = np.ravel(pd.DataFrame(minutesList))
duration[:20]
dataset["resolved_in"] = duration

In [None]:
dataset.dtypes

In [None]:
dataset = dataset.drop(columns="number")

### 5. On garde seulement les incidents ayant pour statut "Closed"

On garde seulement les lignes ayant une valeur correcte pour resolved_in

In [None]:
dataset

Après avoir trouvé ces temps, on va convertir les colonnes de type date (en string) en float.

In [None]:
dataset.opened_at = pd.to_datetime(dataset.opened_at, errors='coerce')
dataset.sys_created_at = pd.to_datetime(dataset.sys_created_at, errors='coerce')
dataset.sys_updated_at = pd.to_datetime(dataset.sys_updated_at, errors='coerce')
dataset.resolved_at = pd.to_datetime(dataset.resolved_at, errors='coerce')
dataset.closed_at = pd.to_datetime(dataset.closed_at, errors='coerce')

### 6. On regarde si certaines colonnes ont trop de valeurs manquantes (et ne sont donc pas utilisables). Si plus de 30% des données de la colonne sont manquantes, on la supprime.

In [None]:
import math
total = len(dataset["incident_state"])
#print(dataset["vendor"][3])
for column in dataset.columns:
    v = 0
    for i in range(total):
        if dataset[column][i] == None:
            v+=1
        elif dataset[column].dtype == int or dataset[column].dtype == float:
            if math.isnan(dataset[column][i]):
                v+=1
    if v/total*100 > 30:
        print("Column name: ", column, " ; Empty: ", v/total*100, "%")
        dataset = dataset.drop(columns=column)
dataset.dtypes

In [None]:
backup = dataset

In [None]:
dataset = backup

# II. Régressions

### 1. Si on a des valeurs infinies dans le dataset, elles sont remplacées par NaN. Ensuite, les valeurs NaN du dataset sont remplacées par une moyenne.

In [None]:
from datetime import datetime
dataset[dataset==np.inf]=np.nan
dataset.fillna(dataset.mean(), inplace=True)

#for i in range(len(dataset['sys_updated_at'])):
#    dataset.sys_updated_at[i] = datetime.timestamp(dataset.sys_updated_at[i])
#dataset.sys_updated_at = datetime.timestamp(dataset.sys_updated_at)

### 2. On crée nos X et Y en supprimant les colonnes nécessaires

In [None]:
y = dataset.resolved_in
x = dataset.drop('resolved_in', axis=1)
x = x.drop('resolved_at', axis=1)
x = x.drop('closed_at', axis=1)
x = x.drop('opened_at', axis=1)
x = x.drop('sys_created_at', axis = 1)
x = x.drop('sys_updated_at', axis=1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
heat = dataset.drop('resolved_at', axis=1)
heat = heat.drop('closed_at', axis=1)
heat = heat.drop('opened_at', axis=1)
heat = heat.drop('sys_created_at', axis = 1)
heat = heat.drop('sys_updated_at', axis=1)

f, ax = plt.subplots(figsize=(10, 8))
corr = heat.corr()
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)


In [None]:
dataset

In [None]:
x

### 3. On sépare nos données en un dataset de train et de test

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x,y,test_size=0.2, random_state=0)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape


### 4. On réinitialise les index de test afin de pouvoir avoir des plots propres

In [None]:
X_test = X_test.reset_index(drop=True)
Y_test = Y_test.reset_index(drop=True)

Test pour déterminer les variables donnant le meilleur score.

Optimisation

Meilleurs paramètres trouvés:
{'criterion': 'mse', 'max_depth': 6, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'min_samples_split': 10}

#### 4.1 Scaling

### 5. Decision Tree Regressor

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

#regressor = DecisionTreeRegressor(criterion='mse', max_depth=6, max_leaf_nodes=100, min_samples_leaf=20, min_samples_split=10)
regressor = DecisionTreeRegressor()
#cross_val_score(regressor, X_train, Y_train, cv=10)
regressor.fit(X_train, Y_train)

In [None]:
importances = pd.Series(regressor.feature_importances_, index=X_train.columns)
importances.nlargest(20).plot(kind='barh')

In [None]:
regressor.score(X_test, Y_test)

In [None]:
y_pred = regressor.predict(X_test)

In [None]:
df=pd.DataFrame({'Actual':Y_test, 'Predicted':y_pred})
df

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, y_pred)))

In [None]:

import matplotlib.pyplot as plt 

plt.figure()
plt.scatter(X_test.index[2000:2100], Y_test[2000:2100], s=20, edgecolor="black",
            c="darkorange", label="original data")
plt.plot(X_test.index[2000:2100], y_pred[2000:2100], color="cornflowerblue",
         label="prediction", linewidth=1)
plt.xlabel("data")
plt.ylabel("Resolution time")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

### 6. Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor



#randomForestRegressor = RandomForestRegressor(n_estimators=600, min_samples_split=2, min_samples_leaf=2, max_features='sqrt', max_depth=110, bootstrap=False, random_state=100)
randomForestRegressor = RandomForestRegressor()
randomForestRegressor.fit(X_train,Y_train)

In [None]:
importances = pd.Series(randomForestRegressor.feature_importances_, index=X_train.columns)
importances.nlargest(20).plot(kind='barh')

In [None]:
randomForestRegressor.score(X_test, Y_test)

In [None]:
yRFR = randomForestRegressor.predict(X_test)

In [None]:
df=pd.DataFrame({'Actual':Y_test, 'Predicted':yRFR})
df

In [None]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, yRFR))
print('Mean Squared Error:', metrics.mean_squared_error(Y_test, yRFR))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y_test, yRFR)))

In [None]:
import matplotlib.pyplot as plt 

plt.figure()
plt.scatter(X_test.index[2131:2180], Y_test[2131:2180], s=20, edgecolor="black",
            c="darkorange", label="original data")
plt.plot(X_test.index[2131:2180], yRFR[2131:2180], color="cornflowerblue",
         label="prediction", linewidth=1)
plt.xlabel("data")
plt.ylabel("Resolution time")
plt.title("Random Forest Regressor")
plt.legend()
plt.show()

### 7. Support Vector Regression

SVR Linear (Ne fonctionne pas)

SVR RBF

### 8. AdaBoost Regressor with decision tree

In [None]:
from sklearn.ensemble import AdaBoostRegressor

adaBoostRegressor = AdaBoostRegressor(n_estimators=150,random_state=0)
#adaBoostRegressor = AdaBoostRegressor()
adaBoostRegressor.fit(X_train,Y_train)


In [None]:
importances = pd.Series(adaBoostRegressor.feature_importances_, index=X_train.columns)
importances.nlargest(20).plot(kind='barh')

In [None]:
adaBoostRegressor.score(X_test, Y_test)

In [None]:
yAda = adaBoostRegressor.predict(X_test)

In [None]:
df=pd.DataFrame({'Actual':Y_test, 'Predicted':yAda})
df

In [None]:
import matplotlib.pyplot as plt 

plt.figure()
plt.scatter(X_test.index[2131:2180], Y_test[2131:2180], s=20, edgecolor="black",
            c="darkorange", label="original data")
plt.plot(X_test.index[2131:2180], yAda[2131:2180], color="cornflowerblue",
         label="prediction", linewidth=1)
plt.xlabel("data")
plt.ylabel("Resolution time")
plt.title("AdaBoost Regressor")
plt.legend()
plt.show()

### 9. GradientBoost Regressor

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gradientBoostRegressor = GradientBoostingRegressor(random_state=1, n_estimators=150)
#gradientBoostRegressor = GradientBoostingRegressor()
gradientBoostRegressor.fit(X_train,Y_train)

In [None]:
importances = pd.Series(gradientBoostRegressor.feature_importances_, index=X_train.columns)
importances.nlargest(20).plot(kind='barh')

In [None]:
gradientBoostRegressor.score(X_test, Y_test)

In [None]:
yGrad = gradientBoostRegressor.predict(X_test)

In [None]:
df=pd.DataFrame({'Actual':Y_test, 'Predicted':yGrad})
df

In [None]:
import matplotlib.pyplot as plt 

plt.figure()
plt.scatter(X_test.index[2131:2180], Y_test[2131:2180], s=20, edgecolor="black",
            c="darkorange", label="original data")
plt.plot(X_test.index[2131:2180], yGrad[2131:2180], color="cornflowerblue",
         label="prediction", linewidth=1)
plt.xlabel("data")
plt.ylabel("Resolution time")
plt.title("GradientBoost Regressor")
plt.legend()
plt.show()

### 10. Linear Regressor

In [None]:
from sklearn.linear_model import LinearRegression

linearRegressor = LinearRegression()
linearRegressor.fit(X_train,Y_train)

In [None]:
importances = pd.Series(linearRegressor.singular_, index=X_train.columns)
importances.nlargest(20).plot(kind='barh')

In [None]:
linearRegressor.score(X_test, Y_test)

In [None]:
linGrad = linearRegressor.predict(X_test)

In [None]:
df=pd.DataFrame({'Actual':Y_test, 'Predicted':linGrad})
df

In [None]:
import matplotlib.pyplot as plt 

plt.figure()
plt.scatter(X_test.index[2131:2180], Y_test[2131:2180], s=20, edgecolor="black",
            c="darkorange", label="original data")
plt.plot(X_test.index[2131:2180], linGrad[2131:2180], color="cornflowerblue",
         label="prediction", linewidth=1)
plt.xlabel("data")
plt.ylabel("Resolution time")
plt.title("GradientBoost Regressor")
plt.legend()
plt.show()

# III. Résultats finaux

### 1. Comparaison des différents modèles

In [None]:
import matplotlib.pyplot as plt 

plt.figure()
plt.scatter(X_test.index[2131:2145], Y_test[2131:2145], s=20, edgecolor="black",
            c="darkorange", label="original data")
plt.plot(X_test.index[2131:2145], y_pred[2131:2145], color="red",
         label="prediction Decision Tree Regressor", linewidth=1)
plt.plot(X_test.index[2131:2145], yRFR[2131:2145], color="cornflowerblue",
         label="prediction Random Forest Regressor", linewidth=1)
#plt.plot(X_test.index[2131:2145], yRBF[2131:2145], color="green",
#         label="prediction RBF Regressor", linewidth=1)
plt.plot(X_test.index[2131:2145], yAda[2131:2145], color="purple",
         label="prediction AdaBoost Regressor", linewidth=1)
plt.plot(X_test.index[2131:2145], yGrad[2131:2145], color="yellow",
         label="prediction GradientBoost Regressor", linewidth=1)
plt.plot(X_test.index[2131:2145], linGrad[2131:2145], color="green",
         label="prediction Linear Regressor", linewidth=1)
plt.xlabel("data")
plt.ylabel("Resolution time")
plt.title("Decision Tree Regression")
plt.legend()
plt.show()

Sur ce graphe assez réduit, DecisionTreeRegressor et RandomForestRegressor ont les résultats les plus proches de la réalité.

Comparons les scores:

In [None]:
print("Decision Tree Regressor: ", regressor.score(X_test, Y_test))
print("Random Forest Regressor: ", randomForestRegressor.score(X_test, Y_test))
#print("RBF Regressor: ", rbfRegressor.score(X_test, Y_test))
print("AdaBoost Regressor: ", adaBoostRegressor.score(X_test, Y_test))
print("GradientBoost Regressor: ", gradientBoostRegressor.score(X_test, Y_test))
print("Linear Regressor: ", linearRegressor.score(X_test, Y_test))

Ces résultats viennent confirmer le ressenti du graphe: Les scores les plus élevés sont Random Forest et Decision Tree.