# Make Mapping - CFS

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn import preprocessing 
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import svm

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
df= pd.read_csv('/Users/rajathadri_as/Documents/CFSmapping.csv', names=['Make','ModelIn','ModelOut','Count'])

In [5]:
df.head()

Unnamed: 0,Make,ModelIn,ModelOut,Count
0,Xplore,Activ,Activ,12
1,Etrusco,T,T,9
2,Roller Team,Auto-Roller,Auto-Roller,5
3,Benimar,Tessoro,Tessoro,4
4,Chausson,Etape Line 650,Etape Line 650,4


In [6]:
df.shape

(4926, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4926 entries, 0 to 4925
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Make      4926 non-null   object
 1   ModelIn   4923 non-null   object
 2   ModelOut  4926 non-null   object
 3   Count     4926 non-null   int64 
dtypes: int64(1), object(3)
memory usage: 154.1+ KB


In [8]:
df['Make'].nunique()

174

In [9]:
df['ModelIn'].nunique()

4903

In [10]:
df['ModelOut'].nunique()

1954

<div class='alert alert-info'>Basically, we need to map 4903 Model inputs to 1954 Model outputs</div>

In [14]:
df.isna().sum()

Make        0
ModelIn     3
ModelOut    0
Count       0
dtype: int64

In [11]:
df[df['ModelIn'].isna()==True]

Unnamed: 0,Make,ModelIn,ModelOut,Count
474,Auto-Sleepers,,Unknown,2
1395,Coachman,,VIP 565/4,1
3001,Saly,,Calypso,2


In [20]:
df[df['Make']=='Coachman']

Unnamed: 0,Make,ModelIn,ModelOut,Count
1306,Coachman,580,Acadia 580,1
1307,Coachman,2021 Coachman Acadia 545 for sale,Acadia 545,1
1308,Coachman,520 Acadia,Acadia 520,1
1309,Coachman,520/3 VIP,VIP 520/3,1
1310,Coachman,520/4 Ashington,Ashington 520/4,1
1311,Coachman,545 Kimberly,VIP 545,1
1312,Coachman,565 Kimberley caravan - 2022 – Single Axle 4 ...,VIP 565,1
1313,Coachman,Acadia,Acadia,13
1314,Coachman,Acadia 460,Acadia 460,2
1315,Coachman,Acadia 545,Acadia 545,3


<div class = 'alert alert-info'>Need info on how 'Coachman' with no Model input is mapped to 'VIP 565/4'</div>

In [22]:
df['ModelIn'].fillna('Unknown', inplace=True)

In [23]:
df.isna().sum()

Make        0
ModelIn     0
ModelOut    0
Count       0
dtype: int64

## Feature Engineering

In [76]:
df['input'] = df['Make']+', '+df['ModelIn']

In [78]:
df['output']=df['ModelOut']

In [79]:
df.head()

Unnamed: 0,Make,ModelIn,ModelOut,Count,input,output
0,Xplore,Activ,Activ,12,"Xplore, Activ",Activ
1,Etrusco,T,T,9,"Etrusco, T",T
2,Roller Team,Auto-Roller,Auto-Roller,5,"Roller Team, Auto-Roller",Auto-Roller
3,Benimar,Tessoro,Tessoro,4,"Benimar, Tessoro",Tessoro
4,Chausson,Etape Line 650,Etape Line 650,4,"Chausson, Etape Line 650",Etape Line 650


## ML Classifier

In [229]:
df.head()

Unnamed: 0,Make,ModelIn,ModelOut,Count,input,output
0,Xplore,Activ,Activ,12,"Xplore, Activ",Activ
1,Etrusco,T,T,9,"Etrusco, T",T
2,Roller Team,Auto-Roller,Auto-Roller,5,"Roller Team, Auto-Roller",Auto-Roller
3,Benimar,Tessoro,Tessoro,4,"Benimar, Tessoro",Tessoro
4,Chausson,Etape Line 650,Etape Line 650,4,"Chausson, Etape Line 650",Etape Line 650


In [241]:
X = df['ModelIn']
y = df['output']

In [242]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

In [243]:
X_train.shape

(3940,)

In [244]:
X_test.shape

(986,)

In [245]:
y_train.shape

(3940,)

In [246]:
y_test.shape

(986,)

In [236]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [237]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [247]:
pipe_lr = Pipeline(steps=[('cv',TfidfVectorizer(analyzer='word',stop_words= 'english')),
                          ('lr_gCV',GridSearchCV(LogisticRegression(), {'C': [0.001, 0.01, 0.1, 1, 10, 100, 200]}, cv=5))]
                  )

In [255]:
pipe_lr.fit(X, y)

In [256]:
pipe_lr.score(X,y)

0.9796995533901746

In [260]:
# pipe_lr.score(X_test,y_test)

In [257]:
pred1 = 'SANDHURST - TURNBERRY HOLIDAY PARK - SEA VIEWS - LOW SITE FEES'

In [258]:
pipe_lr.predict([pred1])

array(['Sandhurst'], dtype=object)

In [259]:
# Save the model to a file using pickle
filename = 'CFS_mapper.pkl'
with open(filename, 'wb') as file:
    pickle.dump(pipe_lr, file)

In [None]:
file.close()

# <div class = 'alert alert-success'> Pickle and predict </div>

In [12]:
import pickle

In [13]:
with open('CFS_mapper.pkl', 'rb') as file:
    mod = pickle.load(file)

In [14]:
mod

In [26]:
X_new = ["SANDHURST - TURNBERRY HOLIDAY PARK - SEA VIEWS - LOW SITE FEES"]

In [27]:
mod.predict(X_new)

array(['Sandhurst'], dtype=object)

In [23]:
file.close()

# <div class = 'alert alert-warning'>Other ML models</div>

In [195]:
X = df['ModelIn']
y = df['output']

In [196]:
X.shape

(4926,)

In [197]:
y.shape

(4926,)

In [167]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [198]:
X_train.shape

(4433,)

In [199]:
X_test.shape

(493,)

In [200]:
y_train.shape

(4433,)

In [201]:
y_test.shape

(493,)

In [172]:
count_vect = CountVectorizer()
X_train_vect = count_vect.fit_transform(X_train)
X_train_vect.shape

(4433, 3447)

In [173]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_vect)
X_train_tfidf.shape

(4433, 3447)

## Naive Bayes

In [177]:
# Machine Learning
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [178]:
# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names ‘vect’ , ‘tfidf’ and ‘clf’ are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.

from sklearn.pipeline import Pipeline

text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])

text_clf = text_clf.fit(X_train, y_train)

In [179]:
text_clf.score(X_train, y_train)

0.19039025490638395

In [180]:
text_clf.score(X_test, y_test)

0.1643002028397566

## SVM

In [202]:
# Training Support Vector Machines - SVM and calculating its performance

from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42))])

text_clf_svm = text_clf_svm.fit(X, y)



In [175]:
predicted_svm = text_clf_svm.predict(X_test)
np.mean(predicted_svm == y_test)

0.5415821501014199

In [203]:
text_clf_svm.score(X,y)

0.8765732846122615

In [204]:
pred1 = 'Willerby, SANDHURST - TURNBERRY HOLIDAY PARK - SEA VIEWS - LOW SITE FEES'

In [205]:
text_clf_svm.predict([pred1])

array(['Sailsbury'], dtype='<U45')

## GridSearch CV

In [136]:
# Grid Search
# Here, we are creating a list of parameters for which we would like to do performance tuning. 
# All the parameters name start with the classifier name (remember the arbitrary name we gave). 
# E.g. vect__ngram_range; here we are telling to use unigram and bigrams and choose the one which is optimal.

from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

In [130]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)



In [131]:
gs_clf.best_score_

0.5107237963021392

In [132]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [207]:
pipe_svm = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
            ('clf', svm.SVC(random_state=42))])

In [212]:
param_range = [9, 10]
jobs = -1

In [213]:
grid_params_svm = [{'clf__kernel': ['linear', 'rbf'], 
        'clf__C': param_range}]

In [214]:
SVM = GridSearchCV(estimator=pipe_svm,
            param_grid=grid_params_svm,
            scoring='accuracy',
            cv=10,
            n_jobs=jobs)

In [224]:
SVM.fit(X,y)



In [225]:
SVM.best_score_

0.5211056415837992

In [226]:
SVM.best_params_

{'clf__C': 10, 'clf__kernel': 'linear'}

In [227]:
pred1 = 'Sheraton2 bedrooms 42 x 14 feet'

In [228]:
SVM.predict([pred1])

array(['Sheraton'], dtype=object)

In [137]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_test,y_test)



In [138]:
gs_clf.best_score_

0.4228721942281264

In [139]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}