In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import json 

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split


from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sbhatt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Summarization

In [2]:
#Reading the input file to train the model which has around 1600 accepted leads and 1600 rejected leads 
registeredLeads = "registered_leads.json"
exportPath = "testing_registered_leads.json"


with open(registeredLeads,'r',encoding='utf-8') as f1:
    data_fromFile = f1.read()
    data_fromFile = '[' + data_fromFile.replace('\n', ',\n') + ']'

with open(exportPath,'w',encoding='utf-8') as f2:
    f2.write(data_fromFile)
    
with open(exportPath,'r',encoding='utf-8') as f3:
    jsonObject = json.load(f3)
    registeredLeadDf = pd.DataFrame(jsonObject, columns=['leadName','gender','city','reference','notes','academicYear','englishYear'])
    normalize = pd.json_normalize(jsonObject,record_path = 'followUpLog',meta=['leadName','dob','gender','city','reference','academicYear','englishYear','notes'],errors = 'ignore')
                
    normalizedRegisteredLeadDf = normalize.drop(columns = ['contactNumberMail','nextFollowUp','dob','user','contactLogResponse','contactTime.$numberInt','nextFollowUp.$numberInt',
                                    'successful.$numberInt','nextFollowUp.$numberLong','successful','callDuration'])
    
normalizedRegisteredLeadDf

Unnamed: 0,remarks,leadName,gender,city,reference,academicYear,englishYear,notes
0,tomarrow coming,Adithya,M,ERNAKULAM,Online,2017-18,2017,"MBA Jain University,MS RAMAIAH"
1,Today coming,Adithya,M,ERNAKULAM,Online,2017-18,2017,"MBA Jain University,MS RAMAIAH"
2,today coming,Adithya,M,ERNAKULAM,Online,2017-18,2017,"MBA Jain University,MS RAMAIAH"
3,just taking decision,Adithya,M,ERNAKULAM,Online,2017-18,2017,"MBA Jain University,MS RAMAIAH"
4,formal talk,Adithya,M,ERNAKULAM,Online,2017-18,2017,"MBA Jain University,MS RAMAIAH"
...,...,...,...,...,...,...,...,...
29112,XTH 95% XII EXPECTING 95% TRANSFERRED RS 3540 ...,Darshan Suresh Shetty,F,UK,Galaxy portal set 8 2020,2020-21,2020,
29113,transferred rs 3540 for admission guidance on ...,Darshan Suresh Shetty,F,UK,Galaxy portal set 8 2020,2020-21,2020,
29114,XTH 74%/XII-72% BBA LAST YEAR COMPLETED/LOOKIN...,MUHAMMED SAMAN V SIRAJ,M,MUSCAT,Students Reference,2020-21,2020,
29115,WILL FINALISE THE COLLEGE AND CALL BACK ON TO...,MUHAMMED SAMAN V SIRAJ,M,MUSCAT,Students Reference,2020-21,2020,


In [3]:
normalizedRegisteredLeadDf.describe()

Unnamed: 0,remarks,leadName,gender,city,reference,academicYear,englishYear,notes
count,29117,29117,29117,29080,29117,29117,29117,28103
unique,17934,1597,4,738,58,15,5,648
top,Not Attending,Md Mobashirul Islam,M,BANGALORE,SNEHA,2019-20,2019,MBA
freq,1424,89,22438,1123,4113,11709,15578,3712


In [4]:
normalizedRegisteredLeadDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29117 entries, 0 to 29116
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   remarks       29117 non-null  object
 1   leadName      29117 non-null  object
 2   gender        29117 non-null  object
 3   city          29080 non-null  object
 4   reference     29117 non-null  object
 5   academicYear  29117 non-null  object
 6   englishYear   29117 non-null  object
 7   notes         28103 non-null  object
dtypes: object(8)
memory usage: 1.8+ MB


In [5]:
normalizedRegisteredLeadDf['academicYear'].value_counts()

2019-20         11709
2018-19          9035
2020-21          4229
2017-18          1392
2016-17           806
2021-22           516
2019-2021         500
2019-2020         345
2019              334
2020 - 2021        91
2018-2019          73
2020-2021          42
counselling        20
2017-2018          14
2020-2022          11
Name: academicYear, dtype: int64

In [6]:
normalizedRegisteredLeadDf['englishYear'].value_counts()

2019    15578
2018     5979
2020     4894
2017     2073
2021      593
Name: englishYear, dtype: int64

     We can see lot of duplicate year mentioned. We can modify them and add them to one category. 

In [7]:
normalizedRegisteredLeadDf['gender'].value_counts()

M    22438
F     6466
       167
m       46
Name: gender, dtype: int64

    167 rows of data has null string. We can either imputate or drop these rows.

In [8]:
normalizedRegisteredLeadDf['city'].value_counts()

BANGALORE          1123
Bangalore           999
Bangalore           769
KOTTAYAM            620
bangalore           553
                   ... 
Bhatkal               1
PALA                  1
Orissa                1
jammu & kashmir       1
s                     1
Name: city, Length: 738, dtype: int64

In [9]:
normalizedRegisteredLeadDf['notes'].value_counts()

MBA                                            3712
MBBS                                           3017
                                               1818
BBA                                            1140
BAMS                                            595
                                               ... 
MBA   jain                                        2
b.e cs   t.john                                   1
mba finance   cms jain                            1
BPT   rajarajeswari medical medical college       1
BTECH in CS                                       1
Name: notes, Length: 648, dtype: int64

In [10]:
normalizedRegisteredLeadDf['reference'].value_counts()

SNEHA                           4113
Website General Enquiry         3995
WebsiteChat                     3921
Incoming call                   3656
Online                          2720
Students Reference              2400
Galaxy portal set 7             1172
Galaxy Portal 2019 Set 1        1058
Gladiator                        613
Galaxy portal set 1              495
Galaxy portal 2019 set 2         409
KEAM DATA                        402
WhatsApp Chat                    380
Mission 35                       316
Galaxy portal set 8 2020         296
Galaxy portal set 3              276
Hareesh                          269
Gautham Associates               218
Galaxy portal set 3 2019         201
KEA 2018                         183
Direct WalkIn                    153
Galaxy portal set 2              119
college batch                    114
galaxy portal set 5              114
GM WEBSITE ENQUIRY               109
No Reference                     109
MAT DECEMBER 2018                 95
G

In [11]:
#Testing Data
exportPath = "testing_junk_leads.json"
theWholeDataDf = "junk_leads.json"


with open(theWholeDataDf,'r',encoding='utf-8') as f1:
    data_fromFile = f1.read()
    data_fromFile = '[' + data_fromFile.replace('\n', ',\n') + ']'

with open(exportPath,'w',encoding='utf-8') as f2:
    f2.write(data_fromFile)
    
with open(exportPath,'r',encoding='utf-8') as f3:
    jsonObject = json.load(f3)
    unregisteredLeadDf = pd.DataFrame(jsonObject, columns=['leadName','gender','city','academicYear','englishYear'])
    normalize = pd.json_normalize(jsonObject,record_path = 'followUpLog',meta=['leadName','dob','gender','city','reference','notes','academicYear','englishYear'],errors = 'ignore')
                
    normalizedUnRegisteredLeadDf = normalize.drop(columns = ['contactNumberMail','nextFollowUp','dob','user','contactLogResponse','contactTime.$numberInt','nextFollowUp.$numberInt',
                                    'successful.$numberInt','successful','callDuration'])
    
    


# Data Preparation

In [12]:
# convert the entire data to lower case:

normalizedRegisteredLeadDf = normalizedRegisteredLeadDf.apply(lambda x: x.astype(str).str.lower())
normalizedUnRegisteredLeadDf = normalizedUnRegisteredLeadDf.apply(lambda x: x.astype(str).str.lower())


In [13]:
#Data Cleaning : 

#1. Acadmeic Years are arranged properly.
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2017-18' : '2017'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2016-17' : '2016'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020-21' : '2020'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2018-19' : '2018'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2017-2018' : '2017'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2018-2019' : '2018'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019-2020' : '2019'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019-20' : '2019'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2021-22' : '2021'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'counselling' : '2019'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019-2021' : '2019'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020-2022' : '2020'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020-2021' : '2020'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020 - 2021' : '2020'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019 ' : '2019'},regex=True)
normalizedRegisteredLeadDf['academicYear'] = normalizedRegisteredLeadDf['academicYear'].replace(to_replace = {r'201921' : '2019'},regex=True)


In [14]:
# normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019-20' : '2019'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2018-19' : '2018'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2016-17' : '2016'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020-21' : '2020'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020-22' : '2020'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020 - 2021' : '2020'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019 -2020' : '2020'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2013' : '2013'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2020-2021' : '2020'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2019' : '2019'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'bcom' : '2020'},regex=True)
normalizedUnRegisteredLeadDf['academicYear'] = normalizedUnRegisteredLeadDf['academicYear'].replace(to_replace = {r'2021-22' : '2021'},regex=True)


In [15]:
normalizedUnRegisteredLeadDf['academicYear'].value_counts()

2019-20    128302
2018        56071
2020        37002
2016          515
2021           68
2013           10
2019            9
Name: academicYear, dtype: int64

# Feature Engineering : 

In [16]:
 #DataSets Marked as negative response : 
negDataset1  = normalizedUnRegisteredLeadDf[normalizedUnRegisteredLeadDf['academicYear'].str.contains("2016")]
negDataset2  = normalizedUnRegisteredLeadDf[normalizedUnRegisteredLeadDf['academicYear'].str.contains("2013")]
negDataset3  = normalizedUnRegisteredLeadDf[normalizedUnRegisteredLeadDf['academicYear'] == '2018']
negframes = [negDataset1,negDataset2,negDataset3]
negDataset = pd.concat(negframes, sort=False)
negDataset

Unnamed: 0,remarks,leadName,gender,city,reference,notes,academicYear,englishYear
141,switch off,vinayak sharma,,,nmat -2017,,2016,2019
142,switch off,vinayak sharma,,,nmat -2017,,2016,2019
143,switch off,vinayak sharma,,,nmat -2017,,2016,2019
144,not pick,vinayak sharma,,,nmat -2017,,2016,2019
145,switch off,vinayak sharma,,,nmat -2017,,2016,2019
...,...,...,...,...,...,...,...,...
221955,not attending//mail/mailer-2,thejesh reddy m,,,website general enquiry,btech,2018,2019
221956,not attending,thejesh reddy m,,,website general enquiry,btech,2018,2019
221957,joined in ap colleges,thejesh reddy m,,,website general enquiry,btech,2018,2019
221958,not attending,thejesh reddy m,,,website general enquiry,btech,2018,2019


I have taken only leads from previous years, in order to balance the data.

In [17]:
#Grouping remarks column for each candidate and creating a training dataset for both positive and negative datasets.
#Positive dataset : Response = 1 as in they joined the organisation
posCallResponse = normalizedRegisteredLeadDf.groupby('leadName')['remarks'].apply(list).reset_index(name='remarks')
posCallResponse['remarks'] = [','.join(map(str, each)) for each in posCallResponse['remarks']]
posCallResponse['response'] = 1
posCallResponse

Unnamed: 0,leadName,remarks,response
0,\takanksha singh,"not attending/ sent greeting mails,no mail id/...",1
1,\takhil c suresh,transferred rs 10000for registration on may 30...,1
2,\tanisha annu rary,mat-700.50/ degree- 80 above/ discussinga abou...,1
3,\tarmaan mukesh,i spoke to arman candidate dad so he is lookin...,1
4,\tashiq g mithra,waiting for suppli exam given details /transfe...,1
...,...,...,...
1591,shrinidhi katti,he is doing his bba 5th semister.told his the ...,1
1592,steffy sebastian,i spoke to her mom and gave all details. as ca...,1
1593,sushmitha s,"applied for st.joseph, waiting for the result....",1
1594,swamini suryakant gabhane,maharastra 10-94.20 12-74. neet-308 kea reg do...,1


In [19]:
#Negative Dataset : Response = 0 as in they refused to join
negCallResponse = negDataset.groupby('leadName')['remarks'].apply(list).reset_index(name='remarks')
#print(negCallResponse)
negCallResponse['remarks'] = [','.join(map(str, each)) for each in negCallResponse['remarks']]
negCallResponse['response'] = 0    
#negCallResponse.index
negCallResponse

Unnamed: 0,leadName,remarks,response
0,\t\tanupam narayana,"rnr,i directly askd want adm or not but told o...",0
1,\t\tgelly nayak,"rnr,told lil busy n disconnected ,rnr,told not...",0
2,\t\tindranil sen,"b tech cs, 10th in between disconnected ,sent...",0
3,\t\tsomyadip mukharjee,"took adm in kolkata,still i asked do u want in...",0
4,\taakash,"num is not valid..,num is not valid..,num is n...",0
...,...,...,...
4702,vishnu.v,"repeated student/contact me,planning to repeat...",0
4703,zoramliana,"so sent mail and whats app message ,switch off...",0
4704,ńîkħíł móħáńtÿ,m busy will call u once free n disconnected th...,0
4705,ʍօհժ ƙɑíƒ,looking for admission next year,0


In [20]:
#We shall split the training data into 3 parts :  With positive values being 1596 and negative values being 1569 in each training dataset.
negTrainingData1 = pd.DataFrame(negCallResponse,index = range(0,1569))
negTrainingData2 = pd.DataFrame(negCallResponse,index = range(1569,3138))
negTrainingData3 = pd.DataFrame(negCallResponse,index = range(3138,4707))

In [21]:
#Training Data1 
data1 = [posCallResponse,negTrainingData1]
TrainingData1 = pd.concat(data1, sort=False)
#TrainingData1 = TrainingData1[['remarks','response']]

#Training Data2
data2 = [posCallResponse,negTrainingData2]
TrainingData2 = pd.concat(data1, sort=False)


#Training Data3
data3 = [posCallResponse,negTrainingData3]
TrainingData3 = pd.concat(data1, sort=False)



# Preprocessing the Training Data

In [22]:
def process_data(dataset):
    callLogs = dataset['remarks']
    #use regular expressions to replace email addresses, URLs, phone numbers, other numbers

    # Replace email addresses with 'email'
    processed = callLogs.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','emailaddress')

    # Replace URLs with 'webaddress'
    processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$',
                                  'webaddress')

    # Replace money symbols with 'moneysymb' (£ can by typed with ALT key + 156)
    processed = processed.str.replace(r'£|\$', 'moneysymb')
    
    # Replace 10 digit phone numbers (formats include paranthesis, spaces, no spaces, dashes) with 'phonenumber'
    processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$',
                                  'phonenumber')
    #Replace percentage:
    processed = processed.str.replace(r'^[0-9]*\%', 'percentage')

    # Remove punctuation
    processed = processed.str.replace(r'[^\w\d\s]', ' ')

    # Replace whitespace between terms with a single space
    processed = processed.str.replace(r'\s+', ' ')

    # Remove leading and trailing whitespace
    processed = processed.str.replace(r'^\s+|\s+?$', '')
    
    return processed

processedTrainingData1 = process_data(TrainingData1)
processedTrainingData2 = process_data(TrainingData2)
processedTrainingData3 = process_data(TrainingData3)



In [24]:
def clean_data(processedTrainingData):
    corpus = []
    for i in range(len(processedTrainingData)):
        review = processedTrainingData.iloc[i]
        review = review.split()
        ps = PorterStemmer()
        all_stopwords = stopwords.words('english')
        all_stopwords.remove('not')
        review = [ps.stem(word) for word in review if not word in set(all_stopwords)]
        review = ' '.join(review)
        corpus.append(review)
    return corpus
cleanedTrainingData1 = clean_data(processedTrainingData1)
cleanedTrainingData2 = clean_data(processedTrainingData2)
cleanedTrainingData3 = clean_data(processedTrainingData3)



# Bag of Words Model


In [25]:
def bag_of_words_model(corpus,dataset):
    cv = CountVectorizer(max_features=1500)
    X = cv.fit_transform(corpus).toarray()
    y = dataset.iloc[:, -1].values
    return X,y

logs1,classes1 = bag_of_words_model(cleanedTrainingData1,TrainingData1)
logs2,classes2 = bag_of_words_model(cleanedTrainingData2,TrainingData2)
logs3,classes3 = bag_of_words_model(cleanedTrainingData3,TrainingData3)



In [26]:
def split_the_dataset(X,y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 96)
    return X_train, X_test, y_train, y_test
X_train1, X_test1, y_train1, y_test1 = split_the_dataset(logs1,classes1)
X_train2, X_test2, y_train2, y_test2 = split_the_dataset(logs2,classes2)
X_train3, X_test3, y_train3, y_test3 = split_the_dataset(logs3,classes3)


In [27]:
def calBestRandomStateOf(model,X,y):
    max_score=0
    for i in range(40,100):
        x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=.20,random_state=i)
        model.fit(x_train,y_train)
        pred = model.predict(x_test)
        score = accuracy_score(y_test,pred)
        if score>max_score:
            max_score = score
            final_state = i

    return final_state

In [28]:
from sklearn.model_selection import cross_val_score
def build_models(X_train,X_test,y_train,y_test):
    # Define models to train
    names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

    classifiers = [
        KNeighborsClassifier(),
        DecisionTreeClassifier(),
        RandomForestClassifier(),
        LogisticRegression(),
        SGDClassifier(),
        MultinomialNB(),
        SVC()
    ]

    models = zip(names, classifiers)

    for name, model in models:
        classifier = model
        training_data = classifier.fit(X_train,y_train)
        y_pred = training_data.predict(X_test)
        print("Classifier :", name)
        print("-----------------------------------------------------")
        confusionmatrix = confusion_matrix(y_test, y_pred)
        print(" Confusion Matrix :")
        print(confusionmatrix)
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy : ",accuracy*100)
        classificationreport = classification_report(y_test, y_pred)
        print("Classification Report : ")
        print(classificationreport)
        print("--------------------------------------------------------")
        
    
        cv_score = cross_val_score(classifier,X_train,y_train,cv=5,scoring='accuracy')
        print("*************************************************************************************")
        print("Cross Validatiob Score for ",classifier," : ")
        print("Score : ", cv_score)
        print("Mean : ", cv_score.mean())
        print("Standard Deviation : ", cv_score.std())
        print("*************************************************************************************")
        print("")
              

In [29]:
build_models(X_train1, X_test1, y_train1, y_test1)

Classifier : K Nearest Neighbors
-----------------------------------------------------
 Confusion Matrix :
[[297  14]
 [125 197]]
Accuracy :  78.04107424960506
Classification Report : 
              precision    recall  f1-score   support

           0       0.70      0.95      0.81       311
           1       0.93      0.61      0.74       322

    accuracy                           0.78       633
   macro avg       0.82      0.78      0.77       633
weighted avg       0.82      0.78      0.77       633

--------------------------------------------------------
*************************************************************************************
Cross Validatiob Score for  KNeighborsClassifier()  : 
Score :  [0.78500986 0.80473373 0.78656126 0.8201581  0.79249012]
Mean :  0.7977906151819195
Standard Deviation :  0.013160946327730887
*************************************************************************************

Classifier : Decision Tree
----------------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Classifier : Logistic Regression
-----------------------------------------------------
 Confusion Matrix :
[[291  20]
 [ 36 286]]
Accuracy :  91.15323854660348
Classification Report : 
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       311
           1       0.93      0.89      0.91       322

    accuracy                           0.91       633
   macro avg       0.91      0.91      0.91       633
weighted avg       0.91      0.91      0.91       633

--------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

*************************************************************************************
Cross Validatiob Score for  LogisticRegression()  : 
Score :  [0.89940828 0.92504931 0.91699605 0.9229249  0.92094862]
Mean :  0.9170654317811507
Standard Deviation :  0.009218833909167872
*************************************************************************************

Classifier : SGD Classifier
-----------------------------------------------------
 Confusion Matrix :
[[265  46]
 [ 28 294]]
Accuracy :  88.30963665086888
Classification Report : 
              precision    recall  f1-score   support

           0       0.90      0.85      0.88       311
           1       0.86      0.91      0.89       322

    accuracy                           0.88       633
   macro avg       0.88      0.88      0.88       633
weighted avg       0.88      0.88      0.88       633

--------------------------------------------------------
**************************************************************************

In [30]:
build_models(X_train2, X_test2, y_train2, y_test2)

Classifier : K Nearest Neighbors
-----------------------------------------------------
 Confusion Matrix :
[[297  14]
 [125 197]]
Accuracy :  78.04107424960506
Classification Report : 
              precision    recall  f1-score   support

           0       0.70      0.95      0.81       311
           1       0.93      0.61      0.74       322

    accuracy                           0.78       633
   macro avg       0.82      0.78      0.77       633
weighted avg       0.82      0.78      0.77       633

--------------------------------------------------------
*************************************************************************************
Cross Validatiob Score for  KNeighborsClassifier()  : 
Score :  [0.78500986 0.80473373 0.78656126 0.8201581  0.79249012]
Mean :  0.7977906151819195
Standard Deviation :  0.013160946327730887
*************************************************************************************

Classifier : Decision Tree
----------------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Classifier : Logistic Regression
-----------------------------------------------------
 Confusion Matrix :
[[291  20]
 [ 36 286]]
Accuracy :  91.15323854660348
Classification Report : 
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       311
           1       0.93      0.89      0.91       322

    accuracy                           0.91       633
   macro avg       0.91      0.91      0.91       633
weighted avg       0.91      0.91      0.91       633

--------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

*************************************************************************************
Cross Validatiob Score for  LogisticRegression()  : 
Score :  [0.89940828 0.92504931 0.91699605 0.9229249  0.92094862]
Mean :  0.9170654317811507
Standard Deviation :  0.009218833909167872
*************************************************************************************

Classifier : SGD Classifier
-----------------------------------------------------
 Confusion Matrix :
[[285  26]
 [ 42 280]]
Accuracy :  89.25750394944708
Classification Report : 
              precision    recall  f1-score   support

           0       0.87      0.92      0.89       311
           1       0.92      0.87      0.89       322

    accuracy                           0.89       633
   macro avg       0.89      0.89      0.89       633
weighted avg       0.89      0.89      0.89       633

--------------------------------------------------------
**************************************************************************

In [31]:
build_models(X_train3, X_test3, y_train3, y_test3)

Classifier : K Nearest Neighbors
-----------------------------------------------------
 Confusion Matrix :
[[297  14]
 [125 197]]
Accuracy :  78.04107424960506
Classification Report : 
              precision    recall  f1-score   support

           0       0.70      0.95      0.81       311
           1       0.93      0.61      0.74       322

    accuracy                           0.78       633
   macro avg       0.82      0.78      0.77       633
weighted avg       0.82      0.78      0.77       633

--------------------------------------------------------
*************************************************************************************
Cross Validatiob Score for  KNeighborsClassifier()  : 
Score :  [0.78500986 0.80473373 0.78656126 0.8201581  0.79249012]
Mean :  0.7977906151819195
Standard Deviation :  0.013160946327730887
*************************************************************************************

Classifier : Decision Tree
----------------------------------------

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Classifier : Logistic Regression
-----------------------------------------------------
 Confusion Matrix :
[[291  20]
 [ 36 286]]
Accuracy :  91.15323854660348
Classification Report : 
              precision    recall  f1-score   support

           0       0.89      0.94      0.91       311
           1       0.93      0.89      0.91       322

    accuracy                           0.91       633
   macro avg       0.91      0.91      0.91       633
weighted avg       0.91      0.91      0.91       633

--------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

*************************************************************************************
Cross Validatiob Score for  LogisticRegression()  : 
Score :  [0.89940828 0.92504931 0.91699605 0.9229249  0.92094862]
Mean :  0.9170654317811507
Standard Deviation :  0.009218833909167872
*************************************************************************************

Classifier : SGD Classifier
-----------------------------------------------------
 Confusion Matrix :
[[288  23]
 [ 40 282]]
Accuracy :  90.04739336492891
Classification Report : 
              precision    recall  f1-score   support

           0       0.88      0.93      0.90       311
           1       0.92      0.88      0.90       322

    accuracy                           0.90       633
   macro avg       0.90      0.90      0.90       633
weighted avg       0.90      0.90      0.90       633

--------------------------------------------------------
**************************************************************************

Observations : 
    1. From the above results, we can see that Random forest algorithm performs better than the rest of the model on second dataset.
    2. Random Forest Algorithm is giving out results with 93.13%

In [32]:
#Final Model :

from sklearn.model_selection import GridSearchCV

  
random_search = {'criterion': ['entropy', 'gini'],
               'max_features': ['auto', 'sqrt','log2', None],
               'min_samples_leaf': [4, 6, 8, 12],
               'min_samples_split': [5, 7, 10, 14]
                }

gridsearch = GridSearchCV(RandomForestClassifier(),random_search,n_jobs=-1,pre_dispatch=2)
gridsearch.fit(logs2,classes2)
gridsearch.best_params_

KeyboardInterrupt: 

# Predicting model :

In [None]:
predicting_data = normalizedUnRegisteredLeadDf.groupby('leadName')['remarks'].apply(list).reset_index(name='remarks')
predicting_data['remarks'] = [','.join(map(str, each)) for each in predicting_data['remarks']]    


In [None]:
process = process_data(predicting_data)
clean = clean_data(process)
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(clean).toarray()



In [None]:
classifier = LogisticRegression(max_iter = 200)
fitting = classifier.fit(X_train2,y_train2)
final_pred = fitting.predict(X)


In [None]:
output = pd.DataFrame(final_pred)
output.shape
predicting_data['predictedValue'] = output
final_output = predicting_data[['leadName','predictedValue']]

final_output['leadName'] = final_output['leadName'].replace(to_replace = {r'^\t*' : ''},regex=True)


In [None]:
final = unregisteredLeadDf.merge(final_output, on='leadName',how='right')
final[final['predictedValue'] == 1][:10]
final.to_excel(r'C:\\Users\sbhatt\Documents\Python Scripts\Datasets\Lead Analysis\\leads.xlsx', index = False)
