## Statistics coursework: 
### Product Categorisation: 
#### Data preprocessing, feature extraction, classifier selection, evaluation and future predictions

### Import Libraries

In [1]:
# Load libraries
import nltk
import pandas as pd
import numpy as np
import sklearn
from pandas import read_csv
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
import pickle

### Load and explore the data (4 marks)

In [2]:
filename = 'product_category_dataset_new.csv'
data = pd.read_csv(filename, encoding="utf-8")

In [3]:
dataset = pd.DataFrame(data, columns = ['Description',
                                  'Level_1','Level_2','Level_3'])
dataset

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


In [4]:
# print the dataset shape
dataset.shape

(10639, 4)

In [5]:
# print first 10 rows with .head()
dataset.head(10)

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
5,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
6,mit warm protect real stay dainty littl hand p...,09BF5150,C7E19,D06E
7,fal back cozy bas toughskin inf toddl girl mic...,2CEC27F1,ADAD6,3918
8,ev smal lumberjack nee cozy look cool newborn ...,2CEC27F1,ADAD6,98CF
9,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,ED0D


In [6]:
# Information about the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10639 entries, 0 to 10638
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Description  10627 non-null  object
 1   Level_1      10639 non-null  object
 2   Level_2      10639 non-null  object
 3   Level_3      10639 non-null  object
dtypes: object(4)
memory usage: 332.6+ KB


In [7]:
# Describe the dataset - counts, unique values, frequency, 
print(dataset.describe())

                                              Description   Level_1 Level_2  \
count                                               10627     10639   10639   
unique                                               9668        15      36   
top     glory gorg col fing complet outfit express moo...  B092BA29   2D5A3   
freq                                                   24       900     797   

       Level_3  
count    10639  
unique      94  
top       28A7  
freq       332  


### Deal with Missing Data (4 marks)

In [8]:
# Check if data has missing values in the Description column
dataset['Description'].isnull().sum()

12

In [9]:
print(dataset['Description'].sort_values(ascending=False).tail(13))

9290    0 12 oz bareskin perfect veil light med min fl...
1063                                                  NaN
3434                                                  NaN
3458                                                  NaN
7754                                                  NaN
7788                                                  NaN
7796                                                  NaN
7808                                                  NaN
7859                                                  NaN
7936                                                  NaN
7962                                                  NaN
7988                                                  NaN
8004                                                  NaN
Name: Description, dtype: object


In [10]:
# Deal with missing values
dataset_new = dataset.dropna(how='any', subset=['Description'])
dataset_new

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


In [11]:
dataset_new['Description'].isnull().sum()

0

### Drop Classes where the number of instances is < 10 (4 marks)

In [12]:
# There are no value counts of less than 10 in Level_1
dataset_new['Level_1'].value_counts()

B092BA29    900
AAC8EE56    890
35E04739    890
57164AC1    875
2CEC27F1    858
EFEF723B    800
09BF5150    798
69286F45    797
96F95EEC    587
3E1E0D78    579
4C3D8686    574
4513C920    558
014303D1    511
90A8B052    506
D410C91A    504
Name: Level_1, dtype: int64

In [13]:
# Apply to Level_1 
# no need to apply to Level_1

In [14]:
# There are 3 instances of value counts of less than 10 in Level_2 
dataset_new['Level_2'].value_counts().loc[lambda x:x<10]

Series([], Name: Level_2, dtype: int64)

In [15]:
# Apply to Level_2
dataset_new.drop(dataset_new['Level_2'].value_counts().loc[lambda x:x<10])

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


In [16]:
# There are 5 instances of value counts of less than 10 in Level_3 
dataset_new['Level_3'].value_counts().loc[lambda x:x<10]

Series([], Name: Level_3, dtype: int64)

In [17]:
# Apply to Level_3
dataset_new.drop(dataset_new['Level_3'].value_counts().loc[lambda x:x<10])

Unnamed: 0,Description,Level_1,Level_2,Level_3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918
...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5


In [18]:
df_clean = dataset_new.copy()
df_clean['Description'] = df_clean['Description'].astype('str')
df_clean['Description'] = df_clean['Description'].str.replace(',', '')

In [19]:
df_clean.dtypes

Description    object
Level_1        object
Level_2        object
Level_3        object
dtype: object

In [20]:
df_clean.Level_1.unique()

array(['09BF5150', '2CEC27F1', 'AAC8EE56', '4C3D8686', '69286F45',
       '57164AC1', '4513C920', '35E04739', 'EFEF723B', '96F95EEC',
       '014303D1', '90A8B052', 'B092BA29', '3E1E0D78', 'D410C91A'],
      dtype=object)

In [21]:
df_clean.Level_2.unique()

array(['C7E19', 'ADAD6', '914A1', '74974', '2D5A3', '9B69F', '7B638',
       'F4055', '0864A', 'F824F', 'B2DB4', '02FA0', 'D5531', 'CB803',
       'BAE8A', '31FED', 'E69F5', '390F1', '94728', '36080', '77F62',
       'A04D3', '7AED7', '915D4', '6C6B1', '5E038', '262E7', 'AF6B9',
       'C719A', '375FE', '5A8AB', '08960', '9D9EE', 'E6162', 'ACD06',
       '223B2'], dtype=object)

In [22]:
df_clean.Level_3.unique()

array(['FDCF', 'ED0D', 'D06E', '98CF', '3918', '2212', '224F', 'D97D',
       'A2B2', 'C267', 'F72B', '62E8', 'DAEA', '1530', '28A7', '1058',
       '0CB9', '4A72', 'D579', '5DE2', '80C4', 'DDDE', '0F8B', '2C26',
       '1F75', '5B02', '96B8', '7288', '21DA', 'AE8B', 'D436', '02B3',
       '473E', '078B', '6253', '627D', 'FA9E', '2C15', '5BE9', 'CCEE',
       '2ABA', '3E82', '20C0', '3DD3', 'B183', '7C00', '98A8', '44C3',
       '215F', 'DDD5', '8C88', '6856', '695D', '8B36', 'BB6B', '5912',
       '5F7C', '0EBC', 'C563', '5AE1', 'E8E0', '0ED4', 'C5B4', '3E60',
       'BBA5', '6539', '16BD', 'A2FA', '3AAD', '6BE5', '29B3', 'A104',
       'A0E2', '1BE5', 'F45B', '6B02', 'A675', '1F61', 'CD31', 'AA6B',
       '2CFE', '1000', '05A0', 'E3E0', '818C', '2E14', '0B35', '33D1',
       'F0EF', '9203', 'F213', 'D55B', '74C9', '8FEF'], dtype=object)

### Now let's write a Function to Prepare Text (4 marks)
We will apply it to our DataFrame later on

* This function receives a text string and performs the following:
* Convert text to lower case
* Remove punctuation marks
* Apply stemming using the popular Snowball or Porter Stemmer (optional)
* Apply NGram Tokenisation
* Return the tokenised text as a list of strings

In [23]:
from nltk.tokenize import word_tokenize
import re
from nltk import ngrams
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
st = SnowballStemmer(language='english')

def process_text(text,n =1):
    text = str(text)
    # Replace all characters that are non-alphanumeric with spaces
    tokinizer = RegexpTokenizer(r'\w+')
    text = text = re.sub(r'[^\w\s]','', text)
    text = text = re.sub(r'_', '', text)
    # Convert text to lowercase
    text = text.lower()
   
    # Apply ngram
    tokens = ngrams (nltk.word_tokenize(text),3)
    tokenize = [ ' '.join(grams) for grams in tokens]
    
    return tokenize 

In [24]:
# Here is an example function call
process_text("Here we're testing the process_text function, results are as follows:", 3)

['here were testing',
 'were testing the',
 'testing the processtext',
 'the processtext function',
 'processtext function results',
 'function results are',
 'results are as',
 'are as follows']

In [25]:
# Results should look like this:
['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

['here were test',
 'were test the',
 'test the processtext',
 'the processtext function',
 'processtext function result',
 'function result are',
 'result are as',
 'are as follow']

### Now let's apply TF-IDF to extract features from plain text (10 marks)

In [26]:
# Might take a while...
# Here you apply the process_text function to the Description column of the data
text = df_clean.Description
process_text(text)

['0 gerb cap',
 'gerb cap help',
 'cap help keep',
 'help keep littl',
 'keep littl on',
 'littl on head',
 'on head cov',
 'head cov warm',
 'cov warm day',
 'warm day 1',
 'day 1 newborn',
 '1 newborn inf',
 'newborn inf toddl',
 'inf toddl boy',
 'toddl boy hoody',
 'boy hoody jacket',
 'hoody jacket oshkosh',
 'jacket oshkosh b',
 'oshkosh b g',
 'b g 2',
 'g 2 tut',
 '2 tut ballet',
 'tut ballet anym',
 'ballet anym leap',
 'anym leap foxy',
 'leap foxy fash',
 'foxy fash ruffl',
 'fash ruffl tul',
 'ruffl tul toddl',
 'tul toddl 3',
 'toddl 3 newborn',
 '3 newborn inf',
 'newborn inf toddl',
 'inf toddl boy',
 'toddl boy hoody',
 'boy hoody jacket',
 'hoody jacket oshkosh',
 'jacket oshkosh b',
 'oshkosh b g',
 'b g 4',
 'g 4 easy',
 '4 easy keep',
 'easy keep feel',
 'keep feel warm',
 'feel warm cozy',
 'warm cozy inf',
 'cozy inf toddl',
 'inf toddl girl',
 'toddl girl hoody',
 'girl hoody 10634',
 'hoody 10634 term',
 '10634 term 10',
 'term 10 issu',
 '10 issu on',
 'issu on

In [27]:
# Then you pass the results to the bag of words tranformer
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
vect = CountVectorizer()
vect.fit(text)

CountVectorizer()

In [28]:
vocab = format(len(vect.vocabulary_))

In [29]:
vocab = format(vect.vocabulary_)
vocab



In [30]:
bow = vect.transform(text)

In [31]:
bow

<10627x16502 sparse matrix of type '<class 'numpy.int64'>'
	with 297983 stored elements in Compressed Sparse Row format>

In [32]:
features = vect.get_feature_names()
len(features)

16502

Now we can use .transform on our Bag-of-Words (bow) transformed object and transform the entire DataFrame of text file contents. Let's go ahead and check out how the bag-of-words counts for the entire corpus in a large, sparse matrix:

## Apply to the Entire DataFrame

In [33]:
# After that you pass the result of the previous step to sklearn's TfidfTransformer
# which will convert them into a feature matrix
# See here: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html

In [34]:
# Make a copy of the df_clean DataFrame
df = df_clean.copy()

In [35]:
df.shape

(10627, 4)

In [36]:
# Apply the process_text function to the DataFrame
print(df.iloc[:,0].apply(process_text))

0        [gerb cap help, cap help keep, help keep littl...
1        [newborn inf toddl, inf toddl boy, toddl boy h...
2        [tut ballet anym, ballet anym leap, anym leap ...
3        [newborn inf toddl, inf toddl boy, toddl boy h...
4        [easy keep feel, keep feel warm, feel warm coz...
                               ...                        
10634    [term 10 issu, 10 issu on, issu on year, on ye...
10635    [term 12 issu, 12 issu on, issu on year, on ye...
10636    [term 9 issu, 9 issu on, issu on year, on year...
10637    [term 26 issu, 26 issu on, issu on year, on ye...
10638    [term 12 issu, 12 issu on, issu on year, on ye...
Name: Description, Length: 10627, dtype: object


In [37]:
# Create a 'bag of words' with CountVectorizer() using the process_text function
# fit to the Description column
# print the 'bag of words' vocabularly count from Description column
bow_transformer = CountVectorizer(analyzer = process_text).fit(df['Description'])
bow_transformer.vocabulary_

{'gerb cap help': 95297,
 'cap help keep': 36261,
 'help keep littl': 104677,
 'keep littl on': 117199,
 'littl on head': 127456,
 'on head cov': 149745,
 'head cov warm': 103320,
 'cov warm day': 55180,
 'warm day long': 226432,
 'day long design': 59917,
 'long design coordin': 128247,
 'design coordin gerb': 62387,
 'coordin gerb layet': 53176,
 'gerb layet item': 95303,
 'layet item gre': 121790,
 'item gre item': 114051,
 'gre item gift': 100379,
 'item gift giv': 114049,
 'newborn inf toddl': 145971,
 'inf toddl boy': 111168,
 'toddl boy hoody': 214483,
 'boy hoody jacket': 31216,
 'hoody jacket oshkosh': 107654,
 'jacket oshkosh b': 114700,
 'oshkosh b gosh': 151343,
 'b gosh versatil': 20724,
 'gosh versatil addit': 99431,
 'versatil addit everyday': 223805,
 'addit everyday cas': 13715,
 'everyday cas wardrob': 76148,
 'cas wardrob mad': 38008,
 'wardrob mad comfy': 226186,
 'mad comfy cotton': 131966,
 'comfy cotton blend': 49389,
 'cotton blend zip': 53691,
 'blend zip front

In [38]:
# Transform the Description column with the bag of words 
description_bow = bow_transformer.transform(df['Description'])

In [39]:
# Fit the tfidf with the bag of words
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
text_tfidf = transformer.fit(description_bow)

In [40]:
# Transform the tfidf with the bag of words
text_tfidf = transformer.transform(description_bow)

In [41]:
# The resulting matrix is in sparse format, we can transform it into dense
# Code prepared for you so you can see what results look like

# Create an array from the transformed 'bag of words' Description column

text_tfidf = pd.DataFrame(text_tfidf.toarray())

In [42]:
# This is an example result, the matrix will contain lots of zero values, that is expected
# Some values will be non-zero
text_tfidf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,238050,238051,238052,238053,238054,238055,238056,238057,238058,238059
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Split Data into Train and Test sets (4 marks)

In [43]:
X = text_tfidf

# Now the Data is Ready for Classifier Usage

In [46]:
df_new = df.copy()

In [98]:
le = preprocessing.LabelEncoder()
df_new['le_description'] = le.fit_transform(df.Description.values)
df_new['le_level1'] = le.fit_transform(df.Level_1.values)
df_new['le_level2'] = le.fit_transform(df.Level_2.values)
df_new['le_level3'] = le.fit_transform(df.Level_3.values)
df_new

Unnamed: 0,Description,Level_1,Level_2,Level_3,le_description,le_level1,le_level2,le_level3
0,gerb cap help keep littl on head cov warm day ...,09BF5150,C7E19,FDCF,3624,1,29,93
1,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,ED0D,6006,2,24,87
2,tut ballet anym leap foxy fash ruffl tul toddl...,09BF5150,C7E19,D06E,9021,1,29,77
3,newborn inf toddl boy hoody jacket oshkosh b g...,2CEC27F1,ADAD6,98CF,6006,2,24,61
4,easy keep feel warm cozy inf toddl girl hoody ...,2CEC27F1,ADAD6,3918,2593,2,24,28
...,...,...,...,...,...,...,...,...
10634,term 10 issu on year subscriptionyo sav 75 cov...,90A8B052,C719A,1BE5,8402,9,28,12
10635,term 12 issu on year subscriptionyo sav 86 cov...,90A8B052,C719A,F45B,8540,9,28,90
10636,term 9 issu on year subscriptionyo sav 64 cov ...,90A8B052,C719A,A0E2,8768,9,28,62
10637,term 26 issu on year subscriptionyo sav 54 cov...,90A8B052,C719A,1BE5,8558,9,28,12


In [99]:
#X = df_new[['le_description', 'le_level1', 'le_level2','le_level3']]

In [47]:
y = df_new[['Level_1', 'Level_2', 'Level_3']]

In [48]:
y.shape

(10627, 3)

In [49]:
y.columns.values.tolist()

['Level_1', 'Level_2', 'Level_3']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.20, random_state = 42)

In [51]:
# You might need to reset index in each dataframe (depends on you how you do things)
# done for you to make it clearer
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

In [52]:
X_train.shape

(8501, 238060)

In [53]:
y_train.shape

(8501, 3)

In [54]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8501 entries, 0 to 8500
Columns: 238060 entries, 0 to 238059
dtypes: float64(238060)
memory usage: 15.1 GB


In [55]:
X_test.shape

(2126, 238060)

In [56]:
y_test.shape

(2126, 3)

In [59]:
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,238050,238051,238052,238053,238054,238055,238056,238057,238058,238059
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8496,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8498,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8499,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model training for the three levels (8 marks)

In [63]:
# classes for the different levels : y_train
# for some reason assigning the class names to the non-spilt version didn't work
# so assigned classes separately to y_train and y_test
class1 = y_train['Level_1'].astype(str)
class2 = y_train['Level_2'].astype(str)
class3 = y_train['Level_3'].astype(str)

In [64]:
class1.unique()

array(['90A8B052', 'B092BA29', '3E1E0D78', '4C3D8686', 'AAC8EE56',
       '57164AC1', '4513C920', '69286F45', '2CEC27F1', '35E04739',
       '09BF5150', 'EFEF723B', '96F95EEC', '014303D1', 'D410C91A'],
      dtype=object)

In [65]:
# classes for the different levels : y_test
class1_te = y_test['Level_1'].astype(str)
class2_te = y_test['Level_2'].astype(str)
class3_te = y_test['Level_3'].astype(str)

In [66]:
class1_te.unique()

array(['D410C91A', '09BF5150', '2CEC27F1', '57164AC1', '69286F45',
       'AAC8EE56', '3E1E0D78', '4C3D8686', '35E04739', '4513C920',
       'B092BA29', 'EFEF723B', '014303D1', '96F95EEC', '90A8B052'],
      dtype=object)

## Create and save model level 1

In [383]:
model = MultinomialNB()
model.fit(X_train, class1)

MultinomialNB()

In [384]:
# save level 1 classifier
with open('level1.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [385]:
## Here we reload the saved models and use them to predict the levels
# load model for level 1 (done for you)
with open('level1.pk', 'rb') as nb:
    model = pickle.load(nb)

In [386]:
pred_level1 = model.predict(X_test)

In [387]:
#(test[['level_1']], level1_pred)
accuracy_level1 = accuracy_score(class1_te, pred_level1)
accuracy_level1

0.6792097836312324

In [388]:
pred_level1

array(['B092BA29', '09BF5150', '2CEC27F1', ..., 'AAC8EE56', '35E04739',
       '2CEC27F1'], dtype='<U8')

In [389]:
np.unique(pred_level1)

array(['014303D1', '09BF5150', '2CEC27F1', '35E04739', '3E1E0D78',
       '4513C920', '4C3D8686', '57164AC1', '69286F45', '90A8B052',
       '96F95EEC', 'AAC8EE56', 'B092BA29', 'D410C91A', 'EFEF723B'],
      dtype='<U8')

## Create and save models for level 2

In [126]:
# Split the test data into categories for level2 predictions : models c1_0 - c1_14

c1_0 = list(class1[class1 == '014303D1'].index)
c1_1 = list(class1[class1 == '09BF5150'].index)
c1_2 = list(class1[class1 == '2CEC27F1'].index)
c1_3 = list(class1[class1 == '35E04739'].index)
c1_4 = list(class1[class1 == '3E1E0D78'].index)
c1_5 = list(class1[class1 == '4513C920'].index)
c1_6 = list(class1[class1 == '4C3D8686'].index)
c1_7 = list(class1[class1 == '57164AC1'].index) 
c1_8 = list(class1[class1 == '69286F45'].index) 
c1_9 = list(class1[class1 == '90A8B052'].index) 
c1_10 = list(class1[class1 == '96F95EEC'].index)
c1_11 = list(class1[class1 == 'AAC8EE56'].index) 
c1_12 = list(class1[class1 == 'B092BA29'].index)
c1_13 = list(class1[class1 == 'D410C91A'].index) 
c1_14 = list(class1[class1 == 'EFEF723B'].index)

In [75]:
c1_4

[2,
 14,
 18,
 35,
 47,
 85,
 93,
 107,
 124,
 145,
 189,
 208,
 220,
 238,
 277,
 299,
 310,
 324,
 338,
 349,
 366,
 371,
 385,
 392,
 407,
 408,
 415,
 461,
 486,
 487,
 489,
 503,
 505,
 510,
 521,
 564,
 600,
 610,
 625,
 632,
 654,
 662,
 683,
 696,
 701,
 735,
 760,
 764,
 768,
 780,
 805,
 865,
 890,
 905,
 906,
 910,
 987,
 990,
 1019,
 1025,
 1051,
 1052,
 1076,
 1077,
 1115,
 1165,
 1166,
 1183,
 1184,
 1209,
 1258,
 1288,
 1335,
 1373,
 1386,
 1396,
 1416,
 1421,
 1435,
 1441,
 1451,
 1455,
 1467,
 1498,
 1507,
 1525,
 1542,
 1550,
 1558,
 1562,
 1606,
 1619,
 1634,
 1653,
 1655,
 1657,
 1722,
 1751,
 1767,
 1821,
 1823,
 1827,
 1865,
 1879,
 1880,
 1888,
 1910,
 1912,
 1917,
 1920,
 1939,
 1966,
 1973,
 1980,
 2008,
 2030,
 2041,
 2044,
 2059,
 2065,
 2072,
 2076,
 2099,
 2117,
 2154,
 2184,
 2194,
 2206,
 2217,
 2237,
 2247,
 2248,
 2254,
 2280,
 2330,
 2335,
 2336,
 2382,
 2386,
 2390,
 2424,
 2427,
 2431,
 2438,
 2505,
 2521,
 2547,
 2554,
 2557,
 2569,
 2571,
 2579,
 2

In [76]:
X_train.loc[c1_4]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,238050,238051,238052,238053,238054,238055,238056,238057,238058,238059
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8426,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8446,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8450,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [77]:
class2[c1_4]

2       9D9EE
14      9D9EE
18      E6162
35      9D9EE
47      9D9EE
        ...  
8426    9D9EE
8434    E6162
8446    E6162
8450    9D9EE
8477    9D9EE
Name: Level_2, Length: 473, dtype: object

In [78]:
X_train.loc[c1_4], class2[c1_4]

(      0       1       2       3       4       5       6       7       8       \
 2        0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 14       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 18       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 35       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 47       0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 ...      ...     ...     ...     ...     ...     ...     ...     ...     ...   
 8426     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 8434     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 8446     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 8450     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 8477     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
 
       9       ...  238050

In [84]:
# model2: c1_0
#tree.fit(train.loc[x,['x', 'xx','xxx','xxxx','xxxxx']], class2[x])
model.fit(X_train.loc[c1_0], class2[c1_0])

MultinomialNB()

In [85]:
# model2: c1_0:  
with open('c1_0.pk', 'wb') as cls:  
    pickle.dump(model, cls)

In [86]:
with open('c1_0.pk', 'rb') as nb:
    model = pickle.load(nb)

In [87]:
# predict level 2 using level1 (pred_level2)
pred_level2a = model.predict(X_test)
np.unique(pred_level2a)

array(['77F62', '7AED7'], dtype='<U5')

In [88]:
# Level 2 accuracy 
accuracy_level2a = accuracy_score(class2_te, pred_level2a)
accuracy_level2a

0.03386641580432737

In [90]:
# c1_1:
model.fit(X_train.loc[c1_1], class2[c1_1])

MultinomialNB()

In [91]:
# c1_1:  
with open('c1_1.pk', 'wb') as cls:  
    pickle.dump(model, cls)

In [92]:
with open('c1_1.pk', 'rb') as nb:
    model = pickle.load(nb)

In [93]:
# predict level 2 using level1 (pred_level2)
pred_level2b = model.predict(X_test)
np.unique(pred_level2b)

array(['5E038', 'AF6B9', 'C7E19'], dtype='<U5')

In [94]:
# Level 2 accuracy 
accuracy_level2b = accuracy_score(class2_te, pred_level2b)
accuracy_level2b

0.043273753527751646

In [95]:
#c1_2:
model.fit(X_train.loc[c1_2], class2[c1_2])

MultinomialNB()

In [96]:
# c1_2
with open('c1_2.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [97]:
with open('c1_2.pk', 'rb') as nb:
    model = pickle.load(nb)

In [98]:
# predict level 2 using level1 (pred_level2)
pred_level2c = model.predict(X_test)
np.unique(pred_level2c)

array(['ADAD6', 'BAE8A'], dtype='<U5')

In [99]:
accuracy_level2c = accuracy_score(class2_te, pred_level2c)
accuracy_level2c

0.07572906867356538

In [100]:
# c1_3 
model.fit(X_train.loc[c1_3], class2[c1_3])

MultinomialNB()

In [101]:
# c1_3
with open('c1_3.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [102]:
with open('c1_3.pk', 'rb') as nb:
    model = pickle.load(nb)

In [103]:
# predict level 2 using level1 (pred_level2)
pred_level2d = model.predict(X_test)
np.unique(pred_level2d)

array(['390F1', 'B2DB4'], dtype='<U5')

In [104]:
accuracy_level2d = accuracy_score(class2_te, pred_level2d)
accuracy_level2d

0.07243650047036689

In [105]:
# c1_4
model.fit(X_train.loc[c1_4], class2[c1_4])

MultinomialNB()

In [106]:
# c1_4
with open('c1_4.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [107]:
with open('c1_4.pk', 'rb') as nb:
    model = pickle.load(nb)

In [108]:
# predict level 2 using level1 (pred_level2)
pred_level2e = model.predict(X_test)
np.unique(pred_level2e)

array(['9D9EE'], dtype='<U5')

In [109]:
accuracy_level2e = accuracy_score(class2_te, pred_level2e)
accuracy_level2e

0.040921919096895576

In [110]:
# c1_5
model.fit(X_train.loc[c1_5], class2[c1_5])

MultinomialNB()

In [111]:
# c1_5
with open('c1_5.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [112]:
with open('c1_5.pk', 'rb') as nb:
    model = pickle.load(nb)

In [113]:
# predict level 2 using level1 (pred_level2)
pred_level2f = model.predict(X_test)
np.unique(pred_level2f)

array(['31FED', 'E69F5', 'F4055'], dtype='<U5')

In [114]:
accuracy_level2f = accuracy_score(class2_te, pred_level2f)
accuracy_level2f

0.04374412041392286

In [115]:
# c1_6
model.fit(X_train.loc[c1_6], class2[c1_6])

MultinomialNB()

In [116]:
# c1_6
with open('c1_6.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [117]:
with open('c1_6.pk', 'rb') as nb:
    model = pickle.load(nb)

In [118]:
# predict level 2 using level1 (pred_level2)
pred_level2g = model.predict(X_test)
np.unique(pred_level2g)

array(['74974'], dtype='<U5')

In [119]:
accuracy_level2g = accuracy_score(class2_te, pred_level2g)
accuracy_level2g

0.03621825023518344

In [120]:
# c1_7
model.fit(X_train.loc[c1_7], class2[c1_7])

MultinomialNB()

In [121]:
# c1_7
with open('c1_7.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [122]:
with open('c1_7.pk', 'rb') as nb:
    model = pickle.load(nb)

In [123]:
# predict level 2 using level1 (pred_level2)
pred_level2h = model.predict(X_test)
np.unique(pred_level2h)

array(['7B638', '94728'], dtype='<U5')

In [124]:
accuracy_level2h = accuracy_score(class2_te, pred_level2h)
accuracy_level2h

0.06632173095014111

In [None]:
pd.get_dummies(class1)

In [442]:
# c1_8 Model 8 has a problem converting string to float : couldn't fix it
#model.fit(y_train.loc[c1_8], class2[c1_8])

In [167]:
# c1_8
#with open('c1_8.pk', 'wb') as cls:
#    pickle.dump(model, cls)

In [168]:
#with open('c1_8.pk', 'rb') as nb:
#    model = pickle.load(nb)

In [169]:
# predict level 2 using level1 (pred_level2)
#pred_level2i = model.predict(X_test[['le_level1']])
#np.unique(pred_level2i)

array(['5'], dtype='<U1')

In [170]:
#accuracy_level2i = accuracy_score(class2_te, pred_level2i)
#accuracy_level2i

0.07243650047036689

In [128]:
# c1_9
model.fit(X_train.loc[c1_9], class2[c1_9])

MultinomialNB()

In [129]:
# c1_9
with open('c1_9.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [130]:
with open('c1_9.pk', 'rb') as nb:
    model = pickle.load(nb)

In [131]:
# predict level 2 using level1 (pred_level2)
pred_level2j = model.predict(X_test)
np.unique(pred_level2j)

array(['C719A'], dtype='<U5')

In [132]:
accuracy_level2j = accuracy_score(class2_te, pred_level2j)
accuracy_level2j

0.04421448730009407

In [133]:
# c1_10
model.fit(X_train.loc[c1_10], class2[c1_10])

MultinomialNB()

In [134]:
# c1_10
with open('c1_10.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [135]:
with open('c1_10.pk', 'rb') as nb:
    model = pickle.load(nb)

In [136]:
# predict level 2 using level1 (pred_level2)
pred_level2k = model.predict(X_test)
np.unique(pred_level2k)

array(['36080', 'A04D3'], dtype='<U5')

In [137]:
accuracy_level2k = accuracy_score(class2_te, pred_level2k)
accuracy_level2k

0.04374412041392286

In [138]:
# c1_11
model.fit(X_train.loc[c1_11], class2[c1_11])

MultinomialNB()

In [139]:
# c1_11
with open('c1_11.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [140]:
with open('c1_11.pk', 'rb') as nb:
    model = pickle.load(nb)

In [141]:
# predict level 2 using level1 (pred_level2)
pred_level2l = model.predict(X_test)
np.unique(pred_level2l)

array(['914A1', '9B69F'], dtype='<U5')

In [142]:
accuracy_level2l = accuracy_score(class2_te, pred_level2l)
accuracy_level2l

0.07149576669802446

In [143]:
# c1_12
model.fit(X_train.loc[c1_12], class2[c1_12])

MultinomialNB()

In [144]:
# c1_12
with open('c1_12.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [145]:
with open('c1_12.pk', 'rb') as nb:
    model = pickle.load(nb)

In [146]:
# predict level 2 using level1 (pred_level2)
pred_level2m = model.predict(X_test)
np.unique(pred_level2m)

array(['375FE', '5A8AB'], dtype='<U5')

In [147]:
accuracy_level2m = accuracy_score(class2_te, pred_level2m)
accuracy_level2m

0.045625587958607716

In [148]:
# c1_13
model.fit(X_train.loc[c1_13], class2[c1_13])

MultinomialNB()

In [149]:
# c1_13
with open('c1_13.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [150]:
with open('c1_13.pk', 'rb') as nb:
    model = pickle.load(nb)

In [151]:
# predict level 2 using level1 (pred_level2)
pred_level2n = model.predict(X_test)
np.unique(pred_level2n)

array(['ACD06'], dtype='<U5')

In [152]:
accuracy_level2n = accuracy_score(class2_te, pred_level2n)
accuracy_level2n

0.04421448730009407

In [153]:
# c1_14
model.fit(X_train.loc[c1_14], class2[c1_14])

MultinomialNB()

In [154]:
# c1_14
with open('c1_14.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [155]:
with open('c1_14.pk', 'rb') as nb:
    model = pickle.load(nb)

In [156]:
# predict level 2 using level1 (pred_level2)
pred_level2o = model.predict(X_test)
np.unique(pred_level2o)

array(['02FA0', 'CB803'], dtype='<U5')

In [157]:
accuracy_level2o = accuracy_score(class2_te, pred_level2o)
accuracy_level2o

0.061618062088428974

## Create and save models for level 3


In [163]:
# Combined unique combinations of level1 and level2 to predict level3 - all combinations
y_train.groupby(['Level_1','Level_2']).size().reset_index().rename(columns={0:'count'})

Unnamed: 0,Level_1,Level_2,count
0,014303D1,77F62,199
1,014303D1,7AED7,224
2,09BF5150,262E7,46
3,09BF5150,5E038,93
4,09BF5150,6C6B1,26
5,09BF5150,915D4,37
6,09BF5150,AF6B9,33
7,09BF5150,C7E19,343
8,09BF5150,F824F,51
9,2CEC27F1,ADAD6,321


In [164]:
c2_1 = list(class1[class1 == '014303D1'].index & class2[class2 == '77F62'] .index)
c2_2 = list(class1[class1 == '014303D1'].index & class2[class2 == '7AED7'] .index)
c2_3 = list(class1[class1 == '09BF5150'].index & class2[class2 == '262E7'] .index)
c2_4 = list(class1[class1 == '09BF5150'].index & class2[class2 == '5E038'] .index)
c2_5 = list(class1[class1 == '09BF5150'].index & class2[class2 == '6C6B1'] .index)
c2_6 = list(class1[class1 == '09BF5150'].index & class2[class2 == '915D4'] .index)
c2_7 = list(class1[class1 == '09BF5150'].index & class2[class2 == 'AF6B9'] .index)
c2_8 = list(class1[class1 == '09BF5150'].index & class2[class2 == 'C7E19'] .index)
c2_9 = list(class1[class1 == '09BF5150'].index & class2[class2 == 'F824F'] .index)
c2_10 = list(class1[class1 == '2CEC27F1'].index & class2[class2 == 'ADAD6'] .index)
c2_11 = list(class1[class1 == '2CEC27F1'].index & class2[class2 == 'BAE8A'] .index)
c2_12 = list(class1[class1 == '35E04739'].index & class2[class2 == '390F1'] .index)
c2_13 = list(class1[class1 == '35E04739'].index & class2[class2 == 'B2DB4'] .index)
c2_14 = list(class1[class1 == '3E1E0D78'].index & class2[class2 == '9D9EE'] .index)
c2_15 = list(class1[class1 == '3E1E0D78'].index & class2[class2 == 'E6162'] .index)
c2_16 = list(class1[class1 == '4513C920'].index & class2[class2 == '31FED'] .index)
c2_17 = list(class1[class1 == '4513C920'].index & class2[class2 == 'E69F5'] .index)
c2_18 = list(class1[class1 == '4513C920'].index & class2[class2 == 'F4055'] .index)
c2_19 = list(class1[class1 == '4C3D8686'].index & class2[class2 == '223B2'] .index)
c2_20 = list(class1[class1 == '4C3D8686'].index & class2[class2 == '74974'] .index)
c2_21 = list(class1[class1 == '57164AC1'].index & class2[class2 == '0864A'] .index)
c2_22 = list(class1[class1 == '57164AC1'].index & class2[class2 == '7B638'] .index)
c2_23 = list(class1[class1 == '57164AC1'].index & class2[class2 == '94728'] .index)
c2_24 = list(class1[class1 == '69286F45'].index & class2[class2 == '2D5A3'] .index)
c2_25 = list(class1[class1 == '90A8B052'].index & class2[class2 == '08960'] .index)
c2_26 = list(class1[class1 == '90A8B052'].index & class2[class2 == 'C719A'] .index)
c2_27 = list(class1[class1 == '96F95EEC'].index & class2[class2 == '36080'] .index)
c2_28 = list(class1[class1 == '96F95EEC'].index & class2[class2 == 'A04D3'] .index)
c2_29 = list(class1[class1 == 'AAC8EE56'].index & class2[class2 == '914A1'] .index)
c2_30 = list(class1[class1 == 'AAC8EE56'].index & class2[class2 == '9B69F'] .index)
c2_31 = list(class1[class1 == 'B092BA29'].index & class2[class2 == '375FE'] .index)
c2_32 = list(class1[class1 == 'B092BA29'].index & class2[class2 == '5A8AB'] .index)
c2_33 = list(class1[class1 == 'D410C91A'].index & class2[class2 == 'ACD06'] .index)
c2_34 = list(class1[class1 == 'EFEF723B'].index & class2[class2 == '02FA0'] .index)
c2_35 = list(class1[class1 == 'EFEF723B'].index & class2[class2 == 'CB803'] .index) 
c2_36 = list(class1[class1 == 'EFEF723B'].index & class2[class2 == 'D5531'] .index)

In [165]:
y_train.loc[c2_1,['Level_1','Level_2','Level_3']]

Unnamed: 0,Level_1,Level_2,Level_3
128,014303D1,77F62,E8E0
143,014303D1,77F62,E8E0
144,014303D1,77F62,E8E0
153,014303D1,77F62,5AE1
174,014303D1,77F62,5AE1
...,...,...,...
8337,014303D1,77F62,5AE1
8357,014303D1,77F62,5AE1
8377,014303D1,77F62,0ED4
8433,014303D1,77F62,0ED4


In [166]:
class3[c2_1]

128     E8E0
143     E8E0
144     E8E0
153     5AE1
174     5AE1
        ... 
8337    5AE1
8357    5AE1
8377    0ED4
8433    0ED4
8487    0EBC
Name: Level_3, Length: 199, dtype: object

In [167]:
# c2_1: 
model.fit(X_train.loc[c2_1], class3[c2_1])

MultinomialNB()

In [168]:
# c2_1
with open('c2_1.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [169]:
with open('c2_1.pk', 'rb') as nb:
    model = pickle.load(nb)

In [170]:
# predict level 3 using level2 
pred_level3a = model.predict(X_test)
np.unique(pred_level3a)

array(['0EBC', '5AE1'], dtype='<U4')

In [171]:
accuracy_level3a = accuracy_score(class3_te, pred_level3a)
accuracy_level3a

0.0061147695202257765

In [172]:
# c2_2: 
model.fit(X_train.loc[c2_2], class3[c2_2])

MultinomialNB()

In [173]:
# c2_2
with open('c2_2.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [174]:
with open('c2_2.pk', 'rb') as nb:
    model = pickle.load(nb)

In [175]:
# predict level 3 using level2 
pred_level3b = model.predict(X_test)
np.unique(pred_level3b)

array(['16BD', '6539', 'BBA5'], dtype='<U4')

In [176]:
accuracy_level3b = accuracy_score(class3_te, pred_level3b)
accuracy_level3b

0.013640639698965193

In [177]:
# c2_3: 
model.fit(X_train.loc[c2_3], class3[c2_3])

MultinomialNB()

In [178]:
# c2_3
with open('c2_3.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [179]:
with open('c2_3.pk', 'rb') as nb:
    model = pickle.load(nb)

In [180]:
# predict level 3 using level2 
pred_level3c = model.predict(X_test)
np.unique(pred_level3c)

array(['29B3'], dtype='<U4')

In [181]:
accuracy_level3c = accuracy_score(class3_te, pred_level3c)
accuracy_level3c

0.00799623706491063

In [182]:
# c2_4: 
model.fit(X_train.loc[c2_4], class3[c2_4])

MultinomialNB()

In [183]:
# c2_4
with open('c2_4.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [184]:
with open('c2_4.pk', 'rb') as nb:
    model = pickle.load(nb)

In [185]:
# predict level 3 using level2 
pred_level3d = model.predict(X_test)
np.unique(pred_level3d)

array(['6BE5'], dtype='<U4')

In [186]:
accuracy_level3d = accuracy_score(class3_te, pred_level3d)
accuracy_level3d

0.010348071495766699

In [187]:
# c2_5: 
model.fit(X_train.loc[c2_5], class3[c2_5])

MultinomialNB()

In [188]:
# c2_5
with open('c2_5.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [189]:
with open('c2_5.pk', 'rb') as nb:
    model = pickle.load(nb)

In [190]:
# predict level 3 using level2 
pred_level3e = model.predict(X_test)
np.unique(pred_level3e)

array(['3AAD'], dtype='<U4')

In [191]:
accuracy_level3e = accuracy_score(class3_te, pred_level3e)
accuracy_level3e

0.004703668861712135

In [192]:
# c2_6: 
model.fit(X_train.loc[c2_6], class3[c2_6])

MultinomialNB()

In [193]:
# c2_6
with open('c2_6.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [194]:
with open('c2_6.pk', 'rb') as nb:
    model = pickle.load(nb)

In [195]:
# predict level 3 using level2 
pred_level3f = model.predict(X_test)
np.unique(pred_level3f)

array(['A2FA'], dtype='<U4')

In [196]:
accuracy_level3f = accuracy_score(class3_te, pred_level3f)
accuracy_level3f

0.004703668861712135

In [197]:
# c2_7: 
model.fit(X_train.loc[c2_7], class3[c2_7])

MultinomialNB()

In [198]:
# c2_7
with open('c2_7.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [199]:
with open('c2_7.pk', 'rb') as nb:
    model = pickle.load(nb)

In [200]:
# predict level 3 using level2 
pred_level3g = model.predict(X_test)
np.unique(pred_level3g)

array(['A104'], dtype='<U4')

In [201]:
accuracy_level3g = accuracy_score(class3_te, pred_level3g)
accuracy_level3g

0.0014111006585136407

In [202]:
# c2_8: 
model.fit(X_train.loc[c2_8], class3[c2_8])

MultinomialNB()

In [203]:
# c2_8
with open('c2_8.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [204]:
with open('c2_8.pk', 'rb') as nb:
    model = pickle.load(nb)

In [205]:
# predict level 3 using level2 
pred_level3h = model.predict(X_test)
np.unique(pred_level3h)

array(['D06E', 'FDCF'], dtype='<U4')

In [206]:
accuracy_level3h = accuracy_score(class3_te, pred_level3h)
accuracy_level3h

0.015522107243650047

In [207]:
# c2_9: 
model.fit(X_train.loc[c2_9], class3[c2_9])

MultinomialNB()

In [208]:
# c2_9
with open('c2_9.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [209]:
with open('c2_9.pk', 'rb') as nb:
    model = pickle.load(nb)

In [210]:
# predict level 3 using level2 
pred_level3i = model.predict(X_test)
np.unique(pred_level3i)

array(['7288'], dtype='<U4')

In [211]:
accuracy_level3i = accuracy_score(class3_te, pred_level3i)
accuracy_level3i

0.009877704609595485

In [212]:
# c2_10: 
model.fit(X_train.loc[c2_10], class3[c2_10])

MultinomialNB()

In [213]:
# c2_10
with open('c2_10.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [214]:
with open('c2_10.pk', 'rb') as nb:
    model = pickle.load(nb)

In [215]:
# predict level 3 using level2 
pred_level3j = model.predict(X_test)
np.unique(pred_level3j)

array(['3918', '98CF', 'ED0D'], dtype='<U4')

In [216]:
accuracy_level3j = accuracy_score(class3_te, pred_level3j)
accuracy_level3j

0.022107243650047036

In [217]:
# c2_11: 
model.fit(X_train.loc[c2_11], class3[c2_11])

MultinomialNB()

In [218]:
# c2_11
with open('c2_11.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [219]:
with open('c2_11.pk', 'rb') as nb:
    model = pickle.load(nb)

In [220]:
# predict level 3 using level2 
pred_level3k = model.predict(X_test)
np.unique(pred_level3k)

array(['20C0', '2ABA', '3E82'], dtype='<U4')

In [221]:
accuracy_level3k = accuracy_score(class3_te, pred_level3k)
accuracy_level3k

0.016933207902163686

In [222]:
# c2_12: 
model.fit(X_train.loc[c2_12], class3[c2_12])

MultinomialNB()

In [223]:
# c2_12
with open('c2_12.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [224]:
with open('c2_12.pk', 'rb') as nb:
    model = pickle.load(nb)

In [225]:
# predict level 3 using level2 
pred_level3l = model.predict(X_test)
np.unique(pred_level3l)

array(['6856', '695D', '8B36', '8C88'], dtype='<U4')

In [226]:
accuracy_level3l = accuracy_score(class3_te, pred_level3l)
accuracy_level3l

0.015522107243650047

In [227]:
# c2_13: 
model.fit(X_train.loc[c2_13], class3[c2_13])

MultinomialNB()

In [228]:
# c2_13
with open('c2_13.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [229]:
with open('c2_13.pk', 'rb') as nb:
    model = pickle.load(nb)

In [230]:
# predict level 3 using level2 
pred_level3m = model.predict(X_test)
np.unique(pred_level3m)

array(['21DA', 'AE8B', 'D436'], dtype='<U4')

In [231]:
accuracy_level3m = accuracy_score(class3_te, pred_level3m)
accuracy_level3m

0.01975540921919097

In [232]:
# c2_14: 
model.fit(X_train.loc[c2_14], class3[c2_14])

MultinomialNB()

In [233]:
# c2_14
with open('c2_14.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [234]:
with open('c2_14.pk', 'rb') as nb:
    model = pickle.load(nb)

In [235]:
# predict level 3 using level2 
pred_level3n = model.predict(X_test)
np.unique(pred_level3n)

array(['05A0', '818C', 'E3E0'], dtype='<U4')

In [236]:
accuracy_level3n = accuracy_score(class3_te, pred_level3n)
accuracy_level3n

0.014111006585136407

In [237]:
# c2_15: 
model.fit(X_train.loc[c2_15], class3[c2_15])

MultinomialNB()

In [238]:
# c2_15
with open('c2_15.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [239]:
with open('c2_15.pk', 'rb') as nb:
    model = pickle.load(nb)

In [240]:
# predict level 3 using level2 
pred_level3o = model.predict(X_test)
np.unique(pred_level3o)

array(['2E14'], dtype='<U4')

In [241]:
accuracy_level3o = accuracy_score(class3_te, pred_level3o)
accuracy_level3o

0.008936970837253057

In [242]:
# c2_16: 
model.fit(X_train.loc[c2_16], class3[c2_16])

MultinomialNB()

In [243]:
# c2_16
with open('c2_16.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [244]:
with open('c2_16.pk', 'rb') as nb:
    model = pickle.load(nb)

In [245]:
# predict level 3 using level2 
pred_level3p = model.predict(X_test)
np.unique(pred_level3p)

array(['3DD3'], dtype='<U4')

In [246]:
accuracy_level3p = accuracy_score(class3_te, pred_level3p)
accuracy_level3p

0.0051740357478833494

In [247]:
# c2_17: 
model.fit(X_train.loc[c2_17], class3[c2_17])

MultinomialNB()

In [248]:
# c2_17
with open('c2_17.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [249]:
with open('c2_17.pk', 'rb') as nb:
    model = pickle.load(nb)

In [250]:
# predict level 3 using level2 
pred_level3q = model.predict(X_test)
np.unique(pred_level3q)

array(['DDD5'], dtype='<U4')

In [251]:
accuracy_level3q = accuracy_score(class3_te, pred_level3q)
accuracy_level3q

0.011759172154280339

In [252]:
# c2_18: 
model.fit(X_train.loc[c2_18], class3[c2_18])

MultinomialNB()

In [253]:
# c2_18
with open('c2_18.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [254]:
with open('c2_18.pk', 'rb') as nb:
    model = pickle.load(nb)

In [255]:
# predict level 3 using level2 
pred_level3r = model.predict(X_test)
np.unique(pred_level3r)

array(['1F75', '5B02', '7C00', 'B183'], dtype='<U4')

In [256]:
accuracy_level3r = accuracy_score(class3_te, pred_level3r)
accuracy_level3r

0.01317027281279398

In [257]:
# c2_19: 
model.fit(X_train.loc[c2_19], class3[c2_19])

MultinomialNB()

In [258]:
# c2_19
with open('c2_19.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [259]:
# c2_19
with open('c2_19.pk', 'rb') as nb:
    model = pickle.load(nb)

In [260]:
# predict level 3 using level2 
pred_level3s = model.predict(X_test)
np.unique(pred_level3s)

array(['F213'], dtype='<U4')

In [261]:
accuracy_level3s = accuracy_score(class3_te, pred_level3s)
accuracy_level3s

0.0051740357478833494

In [None]:
# c2_20: 
model.fit(X_train.loc[c2_20], class3[c2_20])

In [262]:
# c2_20
with open('c2_20.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [263]:
# c2_20
with open('c2_20.pk', 'rb') as nb:
    model = pickle.load(nb)

In [264]:
# predict level 3 using level2 
pred_level3t = model.predict(X_test)
np.unique(pred_level3t)

array(['F213'], dtype='<U4')

In [265]:
accuracy_level3t = accuracy_score(class3_te, pred_level3t)
accuracy_level3t

0.0051740357478833494

In [266]:
# c2_21: 
model.fit(X_train.loc[c2_21], class3[c2_21])

MultinomialNB()

In [267]:
# c2_21
with open('c2_21.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [268]:
# c2_21
with open('c2_21.pk', 'rb') as nb:
    model = pickle.load(nb)

In [269]:
# predict level 3 using level2 
pred_level3u = model.predict(X_test)
np.unique(pred_level3u)

array(['96B8'], dtype='<U4')

In [270]:
accuracy_level3u = accuracy_score(class3_te, pred_level3u)
accuracy_level3u

0.0009407337723424271

In [271]:
# c2_22: 
model.fit(X_train.loc[c2_22], class3[c2_22])

MultinomialNB()

In [272]:
# c2_22
with open('c2_22.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [273]:
# c2_22
with open('c2_22.pk', 'rb') as nb:
    model = pickle.load(nb)

In [274]:
# predict level 3 using level2 
pred_level3v = model.predict(X_test)
np.unique(pred_level3v)

array(['0F8B', '2C26'], dtype='<U4')

In [275]:
accuracy_level3v = accuracy_score(class3_te, pred_level3v)
accuracy_level3v

0.01975540921919097

In [276]:
# c2_23: 
model.fit(X_train.loc[c2_23], class3[c2_23])

MultinomialNB()

In [277]:
# c2_23
with open('c2_23.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [278]:
# c2_23
with open('c2_23.pk', 'rb') as nb:
    model = pickle.load(nb)

In [279]:
# predict level 3 using level2 
pred_level3w = model.predict(X_test)
np.unique(pred_level3w)

array(['5912', 'BB6B'], dtype='<U4')

In [280]:
accuracy_level3w = accuracy_score(class3_te, pred_level3w)
accuracy_level3w

0.017873941674506115

In [281]:
# c2_24: 
model.fit(X_train.loc[c2_24], class3[c2_24])

MultinomialNB()

In [282]:
# c2_24
with open('c2_24.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [283]:
# c2_24
with open('c2_24.pk', 'rb') as nb:
    model = pickle.load(nb)

In [284]:
# predict level 3 using level2 
pred_level3x = model.predict(X_test)
np.unique(pred_level3x)

array(['0CB9', '28A7', '4A72'], dtype='<U4')

In [285]:
accuracy_level3x = accuracy_score(class3_te, pred_level3x)
accuracy_level3x

0.02916274694261524

In [286]:
# c2_25: 
model.fit(X_train.loc[c2_25], class3[c2_25])

MultinomialNB()

In [287]:
# c2_25
with open('c2_25.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [288]:
# c2_25
with open('c2_25.pk', 'rb') as nb:
    model = pickle.load(nb)

In [289]:
# predict level 3 using level2 
pred_level3y = model.predict(X_test)
np.unique(pred_level3y)

array(['1000'], dtype='<U4')

In [290]:
accuracy_level3y = accuracy_score(class3_te, pred_level3y)
accuracy_level3y

0.0037629350893697085

In [291]:
# c2_26: 
model.fit(X_train.loc[c2_26], class3[c2_26])

MultinomialNB()

In [292]:
# c2_26
with open('c2_26.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [293]:
# c2_26
with open('c2_26.pk', 'rb') as nb:
    model = pickle.load(nb)

In [294]:
# predict level 3 using level2 
pred_level3z = model.predict(X_test)
np.unique(pred_level3z)

array(['A0E2'], dtype='<U4')

In [295]:
accuracy_level3z = accuracy_score(class3_te, pred_level3z)
accuracy_level3z

0.023047977422389464

In [296]:
# c2_27: 
model.fit(X_train.loc[c2_27], class3[c2_27])

MultinomialNB()

In [297]:
# c2_27
with open('c2_27.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [298]:
# c2_27
with open('c2_27.pk', 'rb') as nb:
    model = pickle.load(nb)

In [299]:
# predict level 3 using level2 
pred_level3aa = model.predict(X_test)
np.unique(pred_level3aa)

array(['5F7C', 'C563'], dtype='<U4')

In [300]:
accuracy_level3aa = accuracy_score(class3_te, pred_level3aa)
accuracy_level3aa

0.00658513640639699

In [301]:
# c2_28: 
model.fit(X_train.loc[c2_28], class3[c2_28])

MultinomialNB()

In [302]:
# c2_28
with open('c2_28.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [303]:
# c2_28
with open('c2_28.pk', 'rb') as nb:
    model = pickle.load(nb)

In [304]:
# predict level 3 using level2 
pred_level3bb = model.predict(X_test)
np.unique(pred_level3bb)

array(['3E60', 'C5B4'], dtype='<U4')

In [305]:
accuracy_level3bb = accuracy_score(class3_te, pred_level3bb)
accuracy_level3bb

0.021636876763875823

In [306]:
# c2_29: 
model.fit(X_train.loc[c2_29], class3[c2_29])

MultinomialNB()

In [307]:
# c2_29
with open('c2_29.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [308]:
# c2_29
with open('c2_29.pk', 'rb') as nb:
    model = pickle.load(nb)

In [309]:
# predict level 3 using level2 
pred_level3cc = model.predict(X_test)
np.unique(pred_level3cc)

array(['D97D', 'F72B'], dtype='<U4')

In [310]:
accuracy_level3cc = accuracy_score(class3_te, pred_level3cc)
accuracy_level3cc

0.015522107243650047

In [311]:
# c2_30: 
model.fit(X_train.loc[c2_30], class3[c2_30])

MultinomialNB()

In [312]:
# c2_30
with open('c2_30.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [313]:
# c2_30
with open('c2_30.pk', 'rb') as nb:
    model = pickle.load(nb)

In [314]:
# predict level 3 using level2 
pred_level3dd = model.predict(X_test)
np.unique(pred_level3dd)

array(['80C4', 'D579'], dtype='<U4')

In [315]:
accuracy_level3dd = accuracy_score(class3_te, pred_level3dd)
accuracy_level3dd

0.01458137347130762

In [316]:
# c2_31: 
model.fit(X_train.loc[c2_31], class3[c2_31])

MultinomialNB()

In [317]:
# c2_31
with open('c2_31.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [318]:
# c2_31
with open('c2_31.pk', 'rb') as nb:
    model = pickle.load(nb)

In [319]:
# predict level 3 using level2 
pred_level3ee = model.predict(X_test)
np.unique(pred_level3ee)

array(['1F61'], dtype='<U4')

In [320]:
accuracy_level3ee = accuracy_score(class3_te, pred_level3ee)
accuracy_level3ee

0.016462841015992474

In [321]:
# c2_32: 
model.fit(X_train.loc[c2_32], class3[c2_32])

MultinomialNB()

In [322]:
# c2_32
with open('c2_32.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [323]:
# c2_32
with open('c2_32.pk', 'rb') as nb:
    model = pickle.load(nb)

In [324]:
# predict level 3 using level2 
pred_level3ff = model.predict(X_test)
np.unique(pred_level3ff)

array(['2CFE', 'AA6B'], dtype='<U4')

In [325]:
accuracy_level3ff = accuracy_score(class3_te, pred_level3ff)
accuracy_level3ff

0.02634054562558796

In [326]:
# c2_33: 
model.fit(X_train.loc[c2_33], class3[c2_33])

MultinomialNB()

In [327]:
# c2_33
with open('c2_33.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [328]:
# c2_33
with open('c2_33.pk', 'rb') as nb:
    model = pickle.load(nb)

In [329]:
# predict level 3 using level2 
pred_level3gg = model.predict(X_test)
np.unique(pred_level3gg)

array(['33D1', '9203'], dtype='<U4')

In [330]:
accuracy_level3gg = accuracy_score(class3_te, pred_level3gg)
accuracy_level3gg

0.01975540921919097

In [331]:
# c2_34: 
model.fit(X_train.loc[c2_34], class3[c2_34])

MultinomialNB()

In [332]:
# c2_34
with open('c2_34.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [333]:
# c2_34
with open('c2_34.pk', 'rb') as nb:
    model = pickle.load(nb)

In [334]:
# predict level 3 using level2 
pred_level3hh = model.predict(X_test)
np.unique(pred_level3hh)

array(['02B3', '078B'], dtype='<U4')

In [335]:
accuracy_level3hh = accuracy_score(class3_te, pred_level3hh)
accuracy_level3hh

0.013640639698965193

In [336]:
# c2_35: 
model.fit(X_train.loc[c2_35], class3[c2_35])

MultinomialNB()

In [337]:
# c2_35
with open('c2_35.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [338]:
# c2_35
with open('c2_35.pk', 'rb') as nb:
    model = pickle.load(nb)

In [339]:
# predict level 3 using level2 
pred_level3ii = model.predict(X_test)
np.unique(pred_level3ii)

array(['2C15', '5BE9', '627D', 'FA9E'], dtype='<U4')

In [340]:
accuracy_level3ii = accuracy_score(class3_te, pred_level3ii)
accuracy_level3ii

0.017873941674506115

In [341]:
# c2_36: 
model.fit(X_train.loc[c2_36], class3[c2_36])

MultinomialNB()

In [342]:
# c2_36
with open('c2_36.pk', 'wb') as cls:
    pickle.dump(model, cls)

In [343]:
# c2_36
with open('c2_36.pk', 'rb') as nb:
    model = pickle.load(nb)

In [344]:
# predict level 3 using level2 
pred_level3jj = model.predict(X_test)
np.unique(pred_level3jj)

array(['6253'], dtype='<U4')

In [345]:
accuracy_level3jj = accuracy_score(class3_te, pred_level3jj)
accuracy_level3jj

0.007525870178739417

## Predict the test set (8 marks)

In [281]:
# Creating an empty Dataframe with column names only (depends on you how you do things)
#results = pd.DataFrame(columns=['Level1_Pred', 'Level2_Pred', 'Level3_Pred'])

## loop through the test data, predict level 1, then based on that predict level 2
## and based on level 2 predict level 3 (you need to load saved models accordingly)
#with open('level1.pk', 'rb') as nb:
#    model = pickle.load(nb)


In [390]:
predict_level1 = pd.DataFrame((pred_level1),columns = ['Pred_Level1'])
predict_level1

Unnamed: 0,Pred_Level1
0,B092BA29
1,09BF5150
2,2CEC27F1
3,57164AC1
4,57164AC1
...,...
2121,35E04739
2122,B092BA29
2123,AAC8EE56
2124,35E04739


In [419]:
pred_level2 = pd.DataFrame (np.stack((pred_level2a, pred_level2b, pred_level2c, pred_level2d, pred_level2e, pred_level2f,
               pred_level2g, pred_level2h, pred_level2j, pred_level2k,
               pred_level2l, pred_level2m, pred_level2n, pred_level2o), axis = 1))

In [420]:
pred_level2 = pd.DataFrame((pred_level2a), columns = ['Pred_Level2'])

In [421]:
np.unique(pred_level2)

array(['77F62', '7AED7'], dtype=object)

In [422]:
pred_level3 = pd.DataFrame(np.stack((pred_level3a, pred_level3b, pred_level3c, pred_level3d, pred_level3e,
                        pred_level3f, pred_level3g, pred_level3h, pred_level3i, pred_level3j,
                        pred_level3k, pred_level3l, pred_level3m, pred_level3n, pred_level3o), axis = 1))

In [433]:
results = pd.DataFrame(np.stack((pred_level1, pred_level2a, pred_level3a), axis = 1))

In [436]:
results

Unnamed: 0,0,1,2
0,B092BA29,7AED7,5AE1
1,09BF5150,7AED7,5AE1
2,2CEC27F1,7AED7,5AE1
3,57164AC1,7AED7,5AE1
4,57164AC1,7AED7,5AE1
...,...,...,...
2121,35E04739,7AED7,5AE1
2122,B092BA29,7AED7,5AE1
2123,AAC8EE56,7AED7,5AE1
2124,35E04739,7AED7,5AE1


## Compute Accuracy on each level (4 marks)
Now you have the predictions for each level (in the test data), and you also have the actual levels, you can compute the accurcay

In [437]:
# Level 1 accuracy
accuracy_level1 = accuracy_score(class1_te, pred_level1)
accuracy_level1

0.6792097836312324

In [439]:
# Level 2 accuracy 
accuracy_level2 = pd.DataFrame(np.stack((accuracy_level2a, accuracy_level2b, accuracy_level2c, accuracy_level2d, accuracy_level2e,
                        accuracy_level2f, accuracy_level2g, accuracy_level2h, accuracy_level2j,
                        accuracy_level2k, accuracy_level2l, accuracy_level2m, accuracy_level2n, accuracy_level2o)),columns = ['accuracy_level2'])

In [297]:
accuracy_level2

Unnamed: 0,accuracy_level2
0,0.027281
1,0.040452
2,0.040452
3,0.043744
4,0.040922
5,0.03857
6,0.036218
7,0.043274
8,0.072437
9,0.044214


In [440]:
# Level 3 accuracy 
accuracy_level3 = pd.DataFrame(np.stack((accuracy_level3a, accuracy_level3b, accuracy_level3c, accuracy_level3d, accuracy_level3e,
                        accuracy_level3f, accuracy_level3g, accuracy_level3h, accuracy_level3i, accuracy_level3j,
                        accuracy_level3k, accuracy_level3l, accuracy_level3m, accuracy_level3n, accuracy_level3o)),columns = ['accuracy_level3'])

In [441]:
accuracy_level3

Unnamed: 0,accuracy_level3
0,0.006115
1,0.013641
2,0.007996
3,0.010348
4,0.004704
5,0.004704
6,0.001411
7,0.015522
8,0.009878
9,0.022107


## Well done!