In [1]:
# DATA PRE-PROCESSING & FEATURE ENGINEERING


# 1. AYKIRI DEĞER ANALİZİ (outlier analysis)
# 2. EKSİK DEĞER ANALİZİ (missing-value analysis)
# 3. LABEL ENCODING
# 4. ONE-HOT ENCODING
# 5. SUPER-CATEGORY CATCHING
# 6. RARE ENCODING
# 7. STANDARDIZATION
# 8. FEATURE ENGINEERING
# 9. RECAP

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
import os
import sklearn
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
import warnings
# Notebook-wide display settings: never truncate columns or rows when a frame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)
# Suppress every warning for the rest of the session.
warnings.simplefilter("ignore")

In [3]:
!pip install missingno



In [4]:
# Read grad/cnc.csv (path relative to the notebook's working directory) into the working frame.
df = pd.read_csv(filepath_or_buffer="grad/cnc.csv")

In [5]:
# Encode the `failure` column as integer class ids. The fitted encoder is kept
# in `failure_label` so the original labels can be recovered later.
failure_label = preprocessing.LabelEncoder().fit(df["failure"])
df["failure"] = failure_label.transform(df["failure"])

In [6]:
# Preview the first five rows after encoding `failure`.
df.head(n=5)

Unnamed: 0,datetime,machineID,voltmean,rotatemean,pressuremean,vibrationmean,voltsd,rotatesd,pressuresd,vibrationsd,error1count,error2count,error3count,error4count,error5count,model,age,failure
0,2015-01-02T05:00:00Z,1,169.733809,445.179865,96.797113,40.38516,11.23312,48.717395,10.07988,5.853209,0.0,0.0,0.0,0.0,0.0,model3,18,0
1,2015-01-02T08:00:00Z,1,170.525721,443.906847,97.667249,39.78667,12.591948,46.930282,9.406795,6.098173,0.0,0.0,0.0,0.0,0.0,model3,18,0
2,2015-01-02T11:00:00Z,1,170.049722,446.461279,96.906162,40.016513,13.277336,42.83678,9.071472,5.481724,0.0,0.0,0.0,0.0,0.0,model3,18,0
3,2015-01-02T14:00:00Z,1,170.341974,447.355315,96.229522,39.921963,13.817158,42.808633,8.256794,5.862312,0.0,0.0,0.0,0.0,0.0,model3,18,0
4,2015-01-02T17:00:00Z,1,170.060643,452.163407,96.357441,39.99047,14.792869,42.525293,8.669605,5.907157,0.0,0.0,0.0,0.0,0.0,model3,18,0


## EKSİK DEĞER ANALİZİ

In [7]:
def missing_values_table(dataframe):
    """Print a per-column summary of missing values and return the affected columns.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Labels of the columns that contain at least one null, in the frame's
        own column order.

    Notes
    -----
    The printed table has one row per affected column with ``n_miss`` (null
    count) and ``ratio`` (percentage of rows that are null, rounded to two
    decimals), ordered by descending count. When no column has nulls an empty
    table is printed and an empty list is returned.
    """
    # Scan the frame for nulls once; every figure below is derived from these counts.
    null_counts = dataframe.isnull().sum()

    variables_with_na = [col for col in dataframe.columns if null_counts.loc[col] > 0]

    na_counts = null_counts.loc[variables_with_na]
    n_miss = na_counts.sort_values(ascending=False)
    ratio = (na_counts / dataframe.shape[0] * 100).sort_values(ascending=False)

    missing_df = pd.concat([n_miss, np.round(ratio, 2)], axis=1, keys=['n_miss', 'ratio'])
    print(missing_df)
    return variables_with_na

In [8]:
# Print the missing-value summary for the working frame and keep the affected column names.
cols_with_na = missing_values_table(dataframe=df)

Empty DataFrame
Columns: [n_miss, ratio]
Index: []


In [9]:
# Display the list of column names returned by missing_values_table(df).
cols_with_na

[]

In [10]:
# NUMERIC TO CATEGORICAL: bucket `age` into labelled bands; this is one more
# feature-engineering technique. One-hot encode the result later to get 1/0 columns.
# Half-open bins [0, 6), [6, 16), [16, inf) leave no unlabelled gap between 15
# and 16; ages outside every bin (negative or missing) stay NaN.
# `.astype(object)` keeps plain string labels rather than a categorical dtype.
df['NEW_AGE_CAT'] = pd.cut(
    df['age'],
    bins=[0, 6, 16, np.inf],
    right=False,
    labels=['yeni', 'orta', 'senior'],
).astype(object)

In [13]:
# Inspect the last 25 rows to check the new age-band column at the end of the frame.
df.tail(n=25)

Unnamed: 0,datetime,machineID,voltmean,rotatemean,pressuremean,vibrationmean,voltsd,rotatesd,pressuresd,vibrationsd,error1count,error2count,error3count,error4count,error5count,model,age,failure,NEW_AGE_CAT
29105,2015-12-29T05:00:00Z,10,172.221604,459.849535,99.264958,41.846089,15.072897,56.217882,9.662575,4.643219,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29106,2015-12-29T08:00:00Z,10,173.264406,460.19635,97.743404,41.776427,14.837848,59.136661,9.565774,4.459934,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29107,2015-12-29T11:00:00Z,10,174.537323,468.784746,97.094641,41.688008,16.116546,57.072026,9.435047,4.152569,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29108,2015-12-29T14:00:00Z,10,173.387197,460.83913,97.434309,41.990841,16.204903,58.67098,9.596835,3.548958,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29109,2015-12-29T17:00:00Z,10,170.903685,461.789582,97.377142,42.035899,15.143889,58.676859,9.734394,3.445994,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29110,2015-12-29T20:00:00Z,10,170.407286,461.836718,98.773072,41.504725,15.35324,57.471087,8.855759,3.896609,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29111,2015-12-29T23:00:00Z,10,170.306833,461.588808,99.256066,42.367145,15.302676,59.759423,8.272807,5.099118,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29112,2015-12-30T02:00:00Z,10,171.94781,456.053136,99.021762,42.694836,14.339163,57.519335,8.269846,5.013536,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29113,2015-12-30T05:00:00Z,10,170.657112,462.095525,99.355855,41.782601,15.250832,55.714363,9.128267,5.531285,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta
29114,2015-12-30T08:00:00Z,10,169.038072,456.880787,100.468406,41.399615,13.539352,46.323936,8.853942,5.656215,0.0,0.0,0.0,0.0,0.0,model3,10,0,orta


In [20]:
# Per-model summary: mean of the encoded `failure` column, plus row count and mean of `age`.
per_model_stats = {"failure": "mean", "age": ["count", "mean"]}
df.groupby("model").agg(per_model_stats)

Unnamed: 0_level_0,failure,age,age
Unnamed: 0_level_1,mean,count,mean
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
model3,0.015105,23304,11
model4,0.016478,5826,7


In [23]:
# Total error events per row across the five error counters, computed once.
# skipna=False matches `+` semantics: a missing counter makes the total NaN
# (row left unlabelled) instead of being silently treated as zero.
error_count_cols = ['error1count', 'error2count', 'error3count', 'error4count', 'error5count']
total_errors = df[error_count_cols].sum(axis=1, skipna=False)

# The bands are mutually exclusive, so assignment order cannot change the
# result. Previously the ">= 1" rule ran after the ">= 5" rule and overwrote
# every "HIGH RISK" row with "RISK".
df.loc[total_errors == 0, "NEW_IS_RISKY"] = "LOW RISK"
df.loc[(total_errors >= 1) & (total_errors < 5), "NEW_IS_RISKY"] = "RISK"
df.loc[total_errors >= 5, "NEW_IS_RISKY"] = "HIGH RISK"
df.head(25)

Unnamed: 0,datetime,machineID,voltmean,rotatemean,pressuremean,vibrationmean,voltsd,rotatesd,pressuresd,vibrationsd,error1count,error2count,error3count,error4count,error5count,model,age,failure,NEW_AGE_CAT,NEW_IS_RISKY
0,2015-01-02T05:00:00Z,1,169.733809,445.179865,96.797113,40.38516,11.23312,48.717395,10.07988,5.853209,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
1,2015-01-02T08:00:00Z,1,170.525721,443.906847,97.667249,39.78667,12.591948,46.930282,9.406795,6.098173,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
2,2015-01-02T11:00:00Z,1,170.049722,446.461279,96.906162,40.016513,13.277336,42.83678,9.071472,5.481724,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
3,2015-01-02T14:00:00Z,1,170.341974,447.355315,96.229522,39.921963,13.817158,42.808633,8.256794,5.862312,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
4,2015-01-02T17:00:00Z,1,170.060643,452.163407,96.357441,39.99047,14.792869,42.525293,8.669605,5.907157,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
5,2015-01-02T20:00:00Z,1,169.369283,453.336163,98.042007,39.531667,15.674787,41.689624,10.607947,6.205887,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
6,2015-01-02T23:00:00Z,1,169.795758,446.832666,98.454608,39.271645,15.742155,38.800266,11.679314,5.579524,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
7,2015-01-03T02:00:00Z,1,170.162325,450.221017,100.880519,38.838137,15.80177,40.49,11.32393,5.565239,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
8,2015-01-03T05:00:00Z,1,170.356866,454.242875,100.787669,38.471831,15.033766,40.858613,11.38492,5.12186,0.0,0.0,0.0,0.0,0.0,model3,18,0,senior,LOW RISK
9,2015-01-03T08:00:00Z,1,169.859196,462.466826,100.409064,40.20433,13.953164,44.269134,10.765817,6.023521,1.0,0.0,0.0,0.0,0.0,model3,18,0,senior,RISK
