In [59]:
import lightgbm as lgb
import pandas as pd
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import log_loss, mean_squared_error, accuracy_score
from sklearn.metrics import classification_report
from sklearn import metrics

pd.options.display.max_columns = None

# Data pre-processing

## Load data and labels

In [60]:
def _load_labels(path, name):
    """Read a single-column, header-less label file as a float DataFrame.

    Args:
        path: Path of the label file to read.
        name: Name to give the single label column.

    Returns:
        A one-column float DataFrame whose column is called ``name``.
    """
    labels = pd.read_csv(path, header=None).astype('float')
    labels.columns = [name]
    return labels


# Feature tables. The train file is read with its header row and the test file
# without one, so the test columns are renamed to match the train columns.
# NOTE(review): the displayed head of train_data starts with an 'Unnamed: 0'
# column, which looks like a saved row index being read as a feature -- confirm
# whether it should be excluded.
train_data = pd.read_csv('orange_small_train.data', sep='\t')
test_data = pd.read_csv('orange_small_test.data', sep='\t', header=None)
test_data.columns = train_data.columns

# One label frame per target, for the train and the test split.
appetency = _load_labels('orange_small_train_appetency_train.labels', 'appetency')
churn = _load_labels('orange_small_train_churn_train.labels', 'churn')
upselling = _load_labels('orange_small_train_upselling_train.labels', 'upselling')

test_appetency = _load_labels('orange_small_train_appetency_test.labels', 'appetency')
test_churn = _load_labels('orange_small_train_churn_test.labels', 'churn')
test_upselling = _load_labels('orange_small_train_upselling_test.labels', 'upselling')

# Stack train on top of test so the same cleaning steps are applied to both.
# ignore_index=True gives the features the same 0..n-1 index as the labels;
# without it the index restarts at 0 for the test rows, so any index-aligned
# join of features and labels would pair rows incorrectly.
combine_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
combine_appetency = pd.concat([appetency, test_appetency], axis=0, ignore_index=True)
combine_churn = pd.concat([churn, test_churn], axis=0, ignore_index=True)
combine_upselling = pd.concat([upselling, test_upselling], axis=0, ignore_index=True)

## Data clean

#### Take a look at the first 5 rows

In [61]:
train_data.head(5)

Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,Var11,Var12,Var13,Var14,Var15,Var16,Var17,Var18,Var19,Var20,Var21,Var22,Var23,Var24,Var25,Var26,Var27,Var28,Var29,Var30,Var31,Var32,Var33,Var34,Var35,Var36,Var37,Var38,Var39,Var40,Var41,Var42,Var43,Var44,Var45,Var46,Var47,Var48,Var49,Var50,Var51,Var52,Var53,Var54,Var55,Var56,Var57,Var58,Var59,Var60,Var61,Var62,Var63,Var64,Var65,Var66,Var67,Var68,Var69,Var70,Var71,Var72,Var73,Var74,Var75,Var76,Var77,Var78,Var79,Var80,Var81,Var82,Var83,Var84,Var85,Var86,Var87,Var88,Var89,Var90,Var91,Var92,Var93,Var94,Var95,Var96,Var97,Var98,Var99,Var100,Var101,Var102,Var103,Var104,Var105,Var106,Var107,Var108,Var109,Var110,Var111,Var112,Var113,Var114,Var115,Var116,Var117,Var118,Var119,Var120,Var121,Var122,Var123,Var124,Var125,Var126,Var127,Var128,Var129,Var130,Var131,Var132,Var133,Var134,Var135,Var136,Var137,Var138,Var139,Var140,Var141,Var142,Var143,Var144,Var145,Var146,Var147,Var148,Var149,Var150,Var151,Var152,Var153,Var154,Var155,Var156,Var157,Var158,Var159,Var160,Var161,Var162,Var163,Var164,Var165,Var166,Var167,Var168,Var169,Var170,Var171,Var172,Var173,Var174,Var175,Var176,Var177,Var178,Var179,Var180,Var181,Var182,Var183,Var184,Var185,Var186,Var187,Var188,Var189,Var190,Var191,Var192,Var193,Var194,Var195,Var196,Var197,Var198,Var199,Var200,Var201,Var202,Var203,Var204,Var205,Var206,Var207,Var208,Var209,Var210,Var211,Var212,Var213,Var214,Var215,Var216,Var217,Var218,Var219,Var220,Var221,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230
0,,,,,,1526.0,7.0,,,,,,184.0,,,,,,,,464.0,580.0,,14.0,128.0,,,166.56,,,,,,,0.0,,,3570.0,,,,,,0.0,,,,,,,,,,,,,4.076907,,,,,,,,9.0,,,,,,,,36,35.0,,1350864.0,,0.0,,,7333.11,,5.0,,12.0,,,,,,,,,,,,,,,,,,,,,,,,104.0,,,168.0,117625.6,,,,,,1175.0,,,,6.0,,720.0,8.0,,,,,,0.0,1212385.0,69134.0,,,,,,185.0,,,0.0,9.0,,,,,397579.0,,,,1812252.0,,,,,,,142.0,,,38418.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,462.0,,,bZkvyxLkBI,RO12,,taul,1K8T,lK27,ka_ns41,nQUveAzAF7,,,dXGu,9_Y1,FbIm,VpdQ,haYg,me75fM6ugJ,kIsH,,uKAI,L84s,XfqtO3UdzaXh_,,,,XTbPUYD,sH5Z,cJvF,FzaX,1YVfGrO,oslk,fXVEsaq,jySVZNlOJy,,,xb3V,RAYp,F2FyR07IdsN7I,,
1,,,,,,525.0,0.0,,,,,,0.0,,,,,,,,168.0,210.0,,2.0,24.0,,,353.52,,,,,,,0.0,,,4764966.0,,,,,,0.0,,,,,,,,,,,,,5.408032,,,,,,,,9.0,,,,,,,3.0,26,0.0,,2872928.0,,3.0,,,151098.9,,25.0,,2.0,,,,,,,,,58158.0,,,,,,,,,,,,,,,40.0,,,40.0,-356411.6,,,,,,590.0,,,,72.0,,0.0,,,,,,,8.0,4136430.0,357038.0,,,,,,0.0,,,0.0,9.0,,,,,278334.0,,,,10439160.0,,,,,,,32.0,,,238572.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,CEat0G8rTN,RO12,,taul,1K8T,2Ix5,qEdASpP,y2LIM01bE1,,,lg1t,9_Y1,k13i,sJzTlal,zm5i,me75fM6ugJ,kIsH,,uKAI,L84s,NhsEn4L,,,,kZJyVg2,,,FzaX,0AJo2f2,oslk,2Kb5FSF,LM8l689qOp,,,fKCe,RAYp,F2FyR07IdsN7I,,
2,,,,,,5236.0,7.0,,,,,,904.0,,,,,,,,1212.0,1515.0,,26.0,816.0,,,220.08,,,,,,,0.0,,,5883894.0,,,,,,0.0,,,,,,,,,,,,,6.599658,,,,,,,,9.0,,,,,,,,130,518.0,,1675776.0,,0.0,,,16211.58,,40.0,,58.0,,,,,,,,,,,,,,,,,,,,,,,,312.0,,,336.0,405104.0,,,,,,3230.0,,,,114.0,,5967.0,-28.0,,,,,,0.0,3478905.0,248932.0,,,,,,800.0,,,0.0,36.0,,,,,320565.0,,,,9826360.0,,,,,,,206.0,,,434946.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,eOQt0GoOh3,AERks4l,SEuy,taul,1K8T,ffXs,NldASpP,y4g9XoZ,vynJTq9,smXZ,4bTR,9_Y1,MGOA,VpdQ,haYg,DHn_WUyBhW_whjA88g9bvA64_,kIsH,,uKAI,L84s,UbxQ8lZ,,TTGHfSv,,pMWAe2U,bHR7,UYBR,FzaX,JFM1BiF,Al6ZaUT,NKv4yOc,jySVZNlOJy,,kG3k,Qu4f,02N6s8f,ib5G6X1eUxUn6,am7c,
3,,,,,,,0.0,,,,,,0.0,,,,,,,,,0.0,,,0.0,,,22.08,,,,,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,1.98825,,,,,,,,9.0,,,,,,,,12,0.0,,0.0,,0.0,,,,,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,-275703.6,,,,,,,,,,0.0,,0.0,-14.0,,,,,,0.0,0.0,0.0,,,,,,0.0,,,0.0,,,,,,,,,,0.0,,,,,,,0.0,,,0.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,jg69tYsGvO,RO12,,taul,1K8T,ssAy,_ybO0dd,4hMlgkf58mhwh,,,W8mQ,9_Y1,YULl,VpdQ,,me75fM6ugJ,kIsH,,uKAI,Mtgm,NhsEn4L,,,,kq0dQfu,eKej,UYBR,FzaX,L91KIiz,oslk,CE7uk3u,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,
4,,,,,,1029.0,7.0,,,,,,3216.0,,,,,,,,64.0,80.0,,4.0,64.0,,,200.0,,,,,,,0.0,,,0.0,,,,,,0.0,,,,,,,,,,,,,4.552446,,,,,,,,18.0,,,,,,,3.0,82,224.0,,784448.0,,0.0,,,37423.5,,0.0,,0.0,,,,,,,,,89754.0,,,,,,,,,,,,,,,32.0,,,56.0,10714.84,,,,,,215.0,,,,0.0,,15111.0,58.0,,,,,,0.0,150650.0,66046.0,,,,,,3255.0,,,0.0,9.0,,,,,267162.0,,,,644836.0,,,,,,,2.0,,,0.0,,,,,,,,,,0.0,,,,,,,,0.0,,,,,,,,,,,IXSgUHShse,RO12,SEuy,taul,1K8T,uNkU,EKR938I,ThrHXVS,0v21jmy,smXZ,xklU,9_Y1,RVjC,sJzTlal,6JmL,me75fM6ugJ,kIsH,,uKAI,L84s,XfqtO3UdzaXh_,,SJs3duv,,11p4mKe,H3p7,UYBR,FzaX,OrnLfvc,oslk,1J2cvxe,LM8l689qOp,,kG3k,FSa2,RAYp,F2FyR07IdsN7I,mj86,


#### Drop the columns whose missing-value rate is above 20%

In [62]:
# Drop every column in which more than 20% of the rows are missing.
MISSING_RATE_THRESHOLD = 0.2
column_names = combine_data.columns
# Count the NaNs of all columns in one vectorised pass instead of iterating
# over every cell in Python; the comparison against rows * threshold is the
# same as before, so the same columns are selected.
missing_counts = combine_data.isna().sum()
max_missing = combine_data.shape[0] * MISSING_RATE_THRESHOLD
drop_list = missing_counts[missing_counts > max_missing].index.tolist()
combine_data = combine_data.drop(drop_list, axis=1)

### Handle missing numeric values

In [63]:
# Impute missing numeric values with their column mean.
# numeric_only=True restricts the mean to numeric columns: the frame still
# contains string columns at this point, and pandas >= 2.0 raises a TypeError
# when asked for the mean of those instead of silently skipping them.
combine_data = combine_data.fillna(combine_data.mean(numeric_only=True))

### Handle non-numeric values
##### 1) Object ---> category
##### 2) For columns with more than 30 unique categories, merge the surplus categories into a single category
##### 3) One-hot encoding

In [64]:
# Give missing entries in the string-typed (object) columns an explicit
# placeholder value so they can later be treated as a category of their own.
object_columns = combine_data.select_dtypes(include=['object']).columns
filled_objects = combine_data[object_columns].fillna(value='missing_value')
combine_data[object_columns] = filled_objects

In [None]:

# Collapse rare categories, then one-hot encode the string-typed columns.
# A column with more than MAX_CATEGORIES distinct values keeps only its
# MAX_CATEGORIES most frequent values; all other values become
# 'extra_category'.
MAX_CATEGORIES = 30
for column in object_columns:
    # value_counts() is ordered by descending frequency, so its head is the
    # set of categories to keep. (Sorting by ascending count and slicing
    # [30:] would collapse the most frequent categories instead.)
    category_counts = combine_data[column].value_counts()
    if len(category_counts) > MAX_CATEGORIES:
        frequent_categories = category_counts.index[:MAX_CATEGORIES]
        # Leave any remaining NaN untouched; only real, infrequent values are merged.
        keep_mask = combine_data[column].isin(frequent_categories) | combine_data[column].isna()
        combine_data[column] = combine_data[column].where(keep_mask, 'extra_category')

# Object ---> category, then expand every categorical column into indicator columns.
combine_data[object_columns] = combine_data[object_columns].apply(lambda col: col.astype('category'))
combine_data = pd.get_dummies(combine_data)



#### Dropping the columns with more than 30 categories: not good.

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# never executed. It drops every categorical column that has more than 30
# categories and one-hot encodes the remaining ones.
'''

combine_data[object_columns] = combine_data[object_columns].fillna('missing_value')
combine_data[object_columns] = combine_data[object_columns].apply(lambda col: col.astype('category'))
combine_data[object_columns].describe()
drop_list = []
for column in object_columns :
    if len(combine_data[column].cat.categories) > 30:
        drop_list.append(column)

combine_data = combine_data.drop(drop_list,axis=1)
object_columns = combine_data.select_dtypes(['category']).columns
combine_data[object_columns].describe()
combine_data = pd.get_dummies(combine_data)
'''


#### Dropping all categorical columns: not good

In [65]:

# Remove every string-typed (object) column that is still present in the
# feature table, leaving only the non-object columns.
object_columns = combine_data.select_dtypes(include=['object']).columns
combine_data = combine_data.drop(columns=object_columns)


### Rebalance

#### The figure shows that the data is very imbalanced, so it needs to be rebalanced.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
fig = plt.figure(1,figsize=(15,6))
plt.subplot(131)
plt.title("appetency")
plt.bar([0,1],[len(appetency[appetency.appetency <= 0]),len(appetency[appetency.appetency > 0])])
plt.subplot(132)
plt.title("churn")
plt.bar([0,1],[len(appetency[churn.churn <= 0]),len(appetency[churn.churn > 0])])
plt.subplot(133)
plt.title("upselling")
plt.bar([0,1],[len(appetency[upselling.upselling <= 0]),len(appetency[upselling.upselling > 0])])

In [66]:

# Map the labels from {-1, 1} to {0, 1}. For those two values clip(lower=0)
# gives the same result as (x + 1) / 2, but it is idempotent: running this
# cell a second time leaves 0/1 labels unchanged instead of turning them into
# 0.5/1.0.
combine_churn = combine_churn.clip(lower=0)
combine_upselling = combine_upselling.clip(lower=0)
combine_appetency = combine_appetency.clip(lower=0)

# The combined frames were built as [train rows, test rows], so the first
# len(train_data) rows are the training split and the remainder is the test split.
n_train = len(train_data)

processed_train_data = combine_data.iloc[:n_train]
processed_test_data = combine_data.iloc[n_train:]

processed_label_train_churn = combine_churn.iloc[:n_train]
processed_label_train_upselling = combine_upselling.iloc[:n_train]
processed_label_train_appetency = combine_appetency.iloc[:n_train]

processed_label_test_churn = combine_churn.iloc[n_train:]
processed_label_test_upselling = combine_upselling.iloc[n_train:]
processed_label_test_appetency = combine_appetency.iloc[n_train:]

#### Random up-sampling: not good.
#### Random down-sampling: better, but still not good enough; the negative-class f1-score drops dramatically.

In [None]:
# Disabled experiments: each part below is wrapped in a string literal, so
# none of it is executed.
# Part 1: attach the appetency label to the training features and split the
# rows into the majority (label 0) and minority (label 1) class.
'''
from sklearn import preprocessing
from sklearn.utils import resample
RANDOM_SEED = 42



processed_train_data = pd.concat([processed_train_data,processed_label_train_appetency],axis=1)
#datatrain = data.drop(['churn','upselling'],axis=1)
df_maj = processed_train_data[processed_train_data.appetency==0]
df_min = processed_train_data[processed_train_data.appetency==1]
'''
# Part 2: random up-sampling of the minority class to the size of the majority
# class, followed by an 80/20 split and feature scaling.
'''
# Upsample minority class

df_min_upsampled = resample(df_min, 
                                replace=True,     # sample with replacement
                                 n_samples=df_maj.shape[0],    # to match majority class
                                 random_state=RANDOM_SEED) # reproducible results

df_upsampled = pd.concat([df_maj, df_min_upsampled])
df_X = df_upsampled.drop(['appetency'],axis=1)
df_y = df_upsampled['appetency']


X_train, X_test, y_train, y_test = train_test_split(df_X,df_y ,test_size=0.2, random_state=RANDOM_SEED)
X_train = X_train.values
X_test = X_test.values
y_train = y_train
y_test = y_test

X_train = preprocessing.scale(X_train)
X_test = preprocessing.scale(X_test)
'''
# Part 3: random down-sampling of the majority class to the size of the
# minority class.
# NOTE(review): this variant samples with replacement (replace=True) and its
# inline note says "to match majority class" although n_samples is the
# minority size -- both look like leftovers from the up-sampling variant.
'''
#Down sample
df_maj_downsampled = resample(df_maj, 
                                replace=True,     # sample with replacement
                                 n_samples=df_min.shape[0],    # to match majority class
                                 random_state=RANDOM_SEED) # reproducible results

df_downsampled = pd.concat([df_min, df_maj_downsampled])
df_X = df_downsampled.drop(['appetency'],axis=1)
df_y = df_downsampled['appetency']
'''


#### Choose the best majority rows. The best rows are those with the fewest missing column values. The minority class has 480 rows, so the 480 best majority rows are selected and combined with them to form the training data. However, the result was not good, so this approach was abandoned.

In [None]:
# Disabled experiment: the code below is wrapped in a string literal, so it is
# never executed. It counts the missing values of every training row and keeps
# the positions of as many rows as there are positive appetency examples.
# NOTE(review): the positions are sorted by ascending missing-value count and
# the LAST entries are taken, which selects the rows with the MOST missing
# values -- confirm whether the first entries were intended.
# NOTE(review): churn is compared against 0 here while appetency is compared
# against -1 -- confirm the raw label encoding.
'''
appetency_negative_num = len(appetency[appetency.appetency == -1])
appetency_positive_num = len(appetency[appetency.appetency == 1])
churn_negative_num = len(churn[churn.churn == 0])
churn_positive_num = len(churn[churn.churn == 1])
avai_data_num = []
for y in train_data.index :
    avai_data_num.append(train_data.loc[y].isnull().sum())

avai_data_num = sorted(range(len(avai_data_num)), key=lambda x: avai_data_num[x])[-(appetency_positive_num):]
#train_data = train_data.iloc[avai_data_num]
'''

In [None]:
### Finally, use SMOTE for up-sampling

In [67]:
from imblearn.over_sampling import SMOTE
from sklearn.utils.validation import column_or_1d

# Synthesise minority examples with SMOTE until class 1 has 5000 rows.
# `sampling_strategy` is the current name of the keyword formerly called
# `ratio`, which newer imbalanced-learn releases no longer accept.
sm = SMOTE(random_state=42, sampling_strategy={1: 5000})
# The labels are a one-column DataFrame; flatten them to a 1-D array so the
# resampler does not emit a column-vector conversion warning.
df_X, df_y = sm.fit_resample(processed_train_data, column_or_1d(processed_label_train_appetency))

  y = column_or_1d(y, warn=True)


# Weighted random forest

In [68]:
from sklearn.ensemble import RandomForestClassifier

In [69]:
from sklearn.model_selection import GridSearchCV

# Class-weighted random forest trained on the SMOTE-resampled appetency data
# and evaluated on the held-out test split.
# random_state is fixed so that the report below is reproducible from run to run.
model_rf = RandomForestClassifier(
    n_estimators=20,
    class_weight='balanced_subsample',
    max_depth=10,
    max_features='sqrt',
    min_samples_split=120,
    min_samples_leaf=40,
    random_state=42,
)
model_rf.fit(df_X, df_y)
result = model_rf.predict(processed_test_data)
print(classification_report(processed_label_test_appetency, result))




              precision    recall  f1-score   support

         0.0       0.98      0.92      0.95     24548
         1.0       0.03      0.15      0.05       452

    accuracy                           0.90     25000
   macro avg       0.51      0.53      0.50     25000
weighted avg       0.97      0.90      0.93     25000



In [None]:
from sklearn.model_selection import GridSearchCV

# Random forest with a manual class weighting (positive class counts 5x),
# trained on the processed, non-resampled training split.
class_weight = {0: 1, 1: 5}
model_rf = RandomForestClassifier(class_weight=class_weight, random_state=42)
# Fit on processed_train_data, not the raw train_data: the raw frame still
# contains NaNs and string columns and does not have the same columns as
# processed_test_data, which is what the model predicts on below.
# The one-column label frame is flattened to the 1-D shape the estimator expects.
model_rf.fit(processed_train_data, processed_label_train_appetency.values.ravel())
result = model_rf.predict(processed_test_data)
print(classification_report(processed_label_test_appetency, result))

In [None]:
# Display the (rows, columns) dimensions of the combined feature table.
combine_data.shape