## Building a Predictive Model to predict the Likelihood of a Customer Churning
### IMPORT LIBRARIES AND READ DATASET

In [3]:
import pandas
import numpy
import matplotlib.pyplot as plt
import sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

df = pandas.read_csv('online_retail.csv',encoding_errors="ignore")
print(df)

       InvoiceNo StockCode                          Description  Quantity  \
0         536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1         536365     71053                  WHITE METAL LANTERN         6   
2         536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3         536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4         536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
...          ...       ...                                  ...       ...   
541904    581587     22613          PACK OF 20 SPACEBOY NAPKINS        12   
541905    581587     22899         CHILDREN'S APRON DOLLY GIRL          6   
541906    581587     23254        CHILDRENS CUTLERY DOLLY GIRL          4   
541907    581587     23255      CHILDRENS CUTLERY CIRCUS PARADE         4   
541908    581587     22138        BAKING SET 9 PIECE RETROSPOT          3   

             InvoiceDate  UnitPrice  CustomerID         Country  
0       0

### DROPPING DUPLICATES AND NULL VALUES

In [4]:
custlist=[]
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
c1=df['CustomerID'].tolist()
c1=set(c1)
#print(set(c1))

### STORING CUSTOMER ID VALUES THAT MATCH DATE REQUIREMENT

In [5]:
c2=[]
df.drop_duplicates(inplace=True)
df.dropna(inplace=True)
for row in df.itertuples():
    row=list(row)
    if(pandas.to_datetime(row[5].split(' ')[0]) > pandas.Timestamp('2011-12-31T12')):
        continue
    if(pandas.to_datetime(row[5].split(' ')[0]) < pandas.Timestamp('2010-12-01T12')):
        continue
    else:
        c2.append(row[7])
        #print(row[7])
c2=set(c2)
#print(c2)

### UPDATING CLASS LABEL VALUES

In [7]:
df1=[]
for row in df.itertuples():
    row=list(row)
    if(row[7] in c2):
        row.append("churn")
    else:
        row.append("not churn")
    df1.append(row)
df1=pandas.DataFrame(df1)
#print(df1)
df1.to_csv('set1.csv')

### FINALIZING THE INPUT, CLASS LABEL ARRAYS TO THE ML MODELS

In [8]:
y=df1[9].tolist()
x=[]
price=df1[6].tolist()
quantity=df1[4].tolist()
i=0
for i in range(0,len(price)):
    prod=price[i]*quantity[i]
    x.append(prod)
#print(x)




### DECISION TREE CLASSIFIER ( DECLARE AND FIT )

In [9]:
y=numpy.array(y).reshape(-1,1)
x=numpy.array(x).reshape(-1,1)
x_train, x_test,y_train, y_test = train_test_split(x,y ,random_state=104, test_size=0.25, shuffle=True)
dt=DecisionTreeClassifier(random_state=20,max_depth=100)
dt.fit(x_train,y_train)


### DECISION TREE CLASSIFIER ( PREDICT AND ACCURACY )

In [10]:
preds=dt.predict(x_test)
print(dt.predict([[22.5]]))
print(accuracy_score(y_test,preds)*100)

['churn']
99.59064152747482


### ARTIFICIAL NEURAL NETWORK ( DECALRE AND FIT )

In [11]:
ann=MLPClassifier(hidden_layer_sizes=40,activation="tanh",solver="adam",verbose=True)
ann.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


Iteration 1, loss = 0.07045020
Iteration 2, loss = 0.02607090
Iteration 3, loss = 0.02546301
Iteration 4, loss = 0.02547555
Iteration 5, loss = 0.02534811
Iteration 6, loss = 0.02538371
Iteration 7, loss = 0.02534115
Iteration 8, loss = 0.02529923
Iteration 9, loss = 0.02504942
Iteration 10, loss = 0.02505772
Iteration 11, loss = 0.02513922
Iteration 12, loss = 0.02504456
Iteration 13, loss = 0.02486534
Iteration 14, loss = 0.02494859
Iteration 15, loss = 0.02501595
Iteration 16, loss = 0.02497372
Iteration 17, loss = 0.02494626
Iteration 18, loss = 0.02491123
Iteration 19, loss = 0.02502154
Iteration 20, loss = 0.02507458
Iteration 21, loss = 0.02497104
Iteration 22, loss = 0.02480392
Iteration 23, loss = 0.02484383
Iteration 24, loss = 0.02485103
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


### ARTIFICIAL NEURAL NETWORK ( PREDICT AND ACCURACY )

In [12]:
predsann=ann.predict(x_test)
print(accuracy_score(predsann,y_test))
print(ann.predict([[22.5]]))

0.9960060158763359
['churn']


### SUPPORT VECTOR MACHINE ( DECLARE AND FIT )

In [13]:
lsvc=LinearSVC(random_state=42,max_iter=1500)
lsvc.fit(x_train,y_train)

  y = column_or_1d(y, warn=True)


### SUPPORT VECTOR MACHINE ( PREDICT AND ACCURACY )

In [15]:
predsvc=lsvc.predict(x_test)
print(lsvc.predict([[22.5]]))
print(accuracy_score(predsvc,y_test))

['churn']
0.9959860957560184


### COMPARISON TABLE

In [16]:
dict={"models":["dt","svm","ann"],"accuracy":[accuracy_score(predsvc,y_test),accuracy_score(predsann,y_test),accuracy_score(y_test,preds)]}
comparisontable=pandas.DataFrame(dict)
print(comparisontable)

  models  accuracy
0     dt  0.995986
1    svm  0.996006
2    ann  0.995906
