In [1006]:
!pip install dask

In [1007]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import dask.dataframe as dd

In [1008]:
df = dd.read_csv('OSX_DS_assignment.csv').head(n=2000)

In [1009]:
df.head()

Unnamed: 0,user_name,country,review_title,review_description,designation,points,price,province,region_1,region_2,winery,variety
0,@kerinokeefe,Italy,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Nicosia,White Blend
1,@vossroger,Portugal,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Quinta dos Avidagos,Portuguese Red
2,@paulgwine,US,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm,Pinot Gris
3,,US,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian,Riesling
4,@paulgwine,US,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks,Pinot Noir


<font size="6">**Data Cleaning**</font>

In [1010]:
len(df) 

2000

<font size="3">**First let's take care of the columns with null values**</font>

In [1011]:
df.isnull().sum() 

user_name              448
country                  0
review_title             0
review_description       0
designation            592
points                   0
price                  128
province                 0
region_1               305
region_2              1168
winery                   0
variety                  0
dtype: int64

In [1012]:
df = df.drop(['region_2','designation'],axis=1)

<font size="3">**It is better to drop 'region_2' column as half of the entries of the column are null and 'designation' column as around one fourth of the entries of the column are null**</font>


<font size="3">**Now let us deal with the column 'country', 'province' and 'region_1'**</font>

In [1013]:
null_rows = df[df.isnull().any(axis=1)]

null_cols = df.columns[df.isnull().any()]
null_cols_in_country_rows = df[null_cols][df['country'].isnull()]

print(null_cols_in_country_rows)

Empty DataFrame
Columns: [user_name, price, region_1]
Index: []


<font size="3">**We can see that country and province have 39 null values each. So we can assume that the both columns have null rows in the same index as each other. But for confirmation we will still check.'**</font>

<font size="3">**Now we have to fill null values in country, province and region_1 with a value indicating that they are correlated. So lets fill null values of column country with 'other country' and province column with 'other province'. And fill region_1 with 'other_region' where country = other country and province = other province. It is because if dont know the country and province of a row, finding region_1 for those doesn't make sense'**</font>

In [1014]:
df['country'].fillna('other_country', inplace=True)

In [1015]:
df['province'].fillna('other_province', inplace=True)

In [1016]:
df.loc[df['province'] == 'other_province', 'region_1'] = df.loc[df['province'] == 'other_province', 'region_1'].fillna('other_region')

In [1017]:
df.isnull().sum() 

user_name             448
country                 0
review_title            0
review_description      0
points                  0
price                 128
province                0
region_1              305
winery                  0
variety                 0
dtype: int64

<font size="3" > **if we check now there are no null values in country and province column but the region_1 column has 16029 null values compared to 16068 earlier. It's because (16068-16029=39 as we imputed 39 null values for country and province)** <font>

In [1018]:
# checking the rows with more than 2 null values in their columns
null_counts = df.isnull().sum(axis=1)
more_than_2_nulls = df[null_counts >2]
print(len(more_than_2_nulls))

0


<font size="3" > **We have 583 rows with more than 2 null columns, it is better to drop them** <font>

In [1019]:
null_counts = df.isnull().sum(axis=1)
drop_rows = df[null_counts > 2].index
df.drop(drop_rows, inplace=True)

In [1020]:
df.head()

Unnamed: 0,user_name,country,review_title,review_description,points,price,province,region_1,winery,variety
0,@kerinokeefe,Italy,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston...",87,,Sicily & Sardinia,Etna,Nicosia,White Blend
1,@vossroger,Portugal,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,,Quinta dos Avidagos,Portuguese Red
2,@paulgwine,US,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris
3,,US,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling
4,@paulgwine,US,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir


In [1021]:
df.isnull().sum()

user_name             448
country                 0
review_title            0
review_description      0
points                  0
price                 128
province                0
region_1              305
winery                  0
variety                 0
dtype: int64

<font size="3" > **Now let us see how many null values are there for user_name column and compare them with country column so we can see how many countries have null user_name** <font>

In [1022]:
null_usernames = df[df['user_name'].isnull()]['country']

null_counts = null_usernames.value_counts() # Count the number of null values in each country group

sorted_null_counts = null_counts.sort_values(ascending=False) # Sort the counts in descending order

print(sorted_null_counts) # Print the sorted values

US              293
Italy           117
Germany          27
France            5
South Africa      1
Australia         1
Romania           1
Mexico            1
Chile             1
Slovenia          1
Name: country, dtype: int64


<font size="3" >**Now as we have filtered number of null user_name as per countries, lets impute the null entries of user_name with most frequent country corresponding to that. For eg - the most frequent user_name for 'US' is '@vboone', so we will fill all the null values in user_name table with the most common country** <font>

In [1023]:
us_data = df[df['country'] == 'US'] # Filter the rows where the 'country' column is 'US'
most_frequent_entry = us_data['user_name'].mode()[0] # Find the most frequent entry in a column
print("The most frequent entry in the 'column_name' column for 'US' data is:", most_frequent_entry)

The most frequent entry in the 'column_name' column for 'US' data is: @vboone


<font size="3" >**Similarly we will do for other countries, but I will not show that just the structure is shown below** <font>

In [1024]:
df.loc[df['country'] =='US','user_name'] = df.loc[df['country'] == 'US', 'user_name'].fillna('@vboone')
df.loc[df['country'] =='Italy','user_name'] = df.loc[df['country'] == 'Italy', 'user_name'].fillna('@kerinokeefe')
df.loc[df['country'] =='Germany','user_name'] = df.loc[df['country'] == 'Germany', 'user_name'].fillna('@JoeCz')
df.loc[df['country'] =='France','user_name'] = df.loc[df['country'] == 'France', 'user_name'].fillna('@vossroger')
df.loc[df['country'] =='Australia','user_name'] = df.loc[df['country'] == 'Australia', 'user_name'].fillna('@JoeCz')
df.loc[df['country'] =='New Zealand','user_name'] = df.loc[df['country'] == 'New Zealand', 'user_name'].fillna('@JoeCz')
df.loc[df['country'] =='Chile','user_name'] = df.loc[df['country'] == 'Chile', 'user_name'].fillna('@wineschach')
df.loc[df['country'] =='South Africa','user_name'] = df.loc[df['country'] == 'South Africa', 'user_name'].fillna('@laurbuzz')
df.loc[df['country'] =='Spain','user_name'] = df.loc[df['country'] == 'Spain', 'user_name'].fillna('@wineschach')
df.loc[df['country'] =='Romania','user_name'] = df.loc[df['country'] == 'Romania', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Bulgaria','user_name'] = df.loc[df['country'] == 'Bulgaria', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Hungary','user_name'] = df.loc[df['country'] == 'Hungary', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Croatia','user_name'] = df.loc[df['country'] == 'Croatia', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Canada','user_name'] = df.loc[df['country'] == 'Canada', 'user_name'].fillna('@paulgwine')
df.loc[df['country'] =='Slovenia','user_name'] = df.loc[df['country'] == 'Slovenia', 'user_name'].fillna('@worldwineguys ')
df.loc[df['country'] =='Lebanon','user_name'] = df.loc[df['country'] == 'Lebanon', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Georgia','user_name'] = df.loc[df['country'] == 'Georgia', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Austria','user_name'] = df.loc[df['country'] == 'Austria', 'user_name'].fillna('@AnneInVino')
df.loc[df['country'] =='Mexico','user_name'] = df.loc[df['country'] == 'Mexico', 'user_name'].fillna('@wineschach')
df.loc[df['country'] =='Turkey','user_name'] = df.loc[df['country'] == 'Turkey', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Israel','user_name'] = df.loc[df['country'] == 'Israel', 'user_name'].fillna('@wineschach')
df.loc[df['country'] =='Argentina','user_name'] = df.loc[df['country'] == 'Argentina', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Ukraine','user_name'] = df.loc[df['country'] == 'Ukraine', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Slovakia','user_name'] = df.loc[df['country'] == 'Slovakia', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Czech Republic','user_name'] = df.loc[df['country'] == 'Czech Republic', 'user_name'].fillna('@worldwineguys')
df.loc[df['country'] =='Macedonia','user_name'] = df.loc[df['country'] == 'Macedonia', 'user_name'].fillna('@suskostrzewa')
df.loc[df['country'] =='Portugal','user_name'] = df.loc[df['country'] == 'Portugal', 'user_name'].fillna('@vossroger')

In [1025]:
df.isnull().sum()

user_name               0
country                 0
review_title            0
review_description      0
points                  0
price                 128
province                0
region_1              305
winery                  0
variety                 0
dtype: int64

<font size="3" >**Fill the null values in 'price' column with the average price of the corresponding country** <font>

In [1026]:
country_avg_price = df.groupby('country')['price'].transform('mean')
df['price'].fillna(country_avg_price, inplace=True)
df.head()

Unnamed: 0,user_name,country,review_title,review_description,points,price,province,region_1,winery,variety
0,@kerinokeefe,Italy,Nicosia 2013 Vulkà Bianco (Etna),"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend
1,@vossroger,Portugal,Quinta dos Avidagos 2011 Avidagos Red (Douro),"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,,Quinta dos Avidagos,Portuguese Red
2,@paulgwine,US,Rainstorm 2013 Pinot Gris (Willamette Valley),"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris
3,@vboone,US,St. Julian 2013 Reserve Late Harvest Riesling ...,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling
4,@paulgwine,US,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir


<font size="3" >**Now we have to impute null values in region_1 column. If we look properly at our data, we can see that 'review_title' has entries in which in the '()', we can see the region. It is our best estimate to impute these values in null columns of region_1. So first we will extract these characters and make a new column called 'new_column'. Then we will fill extact values of new_column in region_1 in same index. Then we drop columns 'review_title and 'new_column'** <font>

In [1027]:
import re
df['new_column'] = df['review_title'].str.extract(r'\((.*?)\)')

In [1028]:
df['region_1'].fillna(df['new_column'], inplace=True)

In [1029]:
df=df.drop(['review_title','new_column'],axis=1)

In [1030]:
df.head()

Unnamed: 0,user_name,country,review_description,points,price,province,region_1,winery,variety
0,@kerinokeefe,Italy,"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend
1,@vossroger,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Douro,Quinta dos Avidagos,Portuguese Red
2,@paulgwine,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris
3,@vboone,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling
4,@paulgwine,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir


In [1031]:
# Let us fill 'not_known' where region_1 is still null even after doing all above procedures.
df['region_1'].fillna('not_known', inplace=True)

<font size="6">**Exploratory Data Analysis**</font>

In [1032]:
df.head()

Unnamed: 0,user_name,country,review_description,points,price,province,region_1,winery,variety
0,@kerinokeefe,Italy,"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend
1,@vossroger,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Douro,Quinta dos Avidagos,Portuguese Red
2,@paulgwine,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris
3,@vboone,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling
4,@paulgwine,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir


In [1033]:
import plotly.express as px
fig = px.histogram(df, x='variety') # shows the different variety of grapes and their count
fig.show()

In [1034]:
import plotly.express as px
fig = px.histogram(df, x="country") # shows the different countries and their counts
fig.show()

In [1035]:
fig = px.scatter(df, x='country', y='price')
fig.update_layout(title_text='country vs price',
    xaxis_title_text='country',
    yaxis_title_text='price',
    font=dict(size=12))
fig.show()

<font size="3" >**We can see in the above plot that most expensive wines are generally from countries that produces wines the most eg. US, Italy, Germany, Chile, Spain** <font>

In [1036]:
fig = px.scatter(df, x='country', y='variety')

fig.update_layout(title_text='country vs variety',
    xaxis_title_text='country',
    yaxis_title_text='variety',
    font=dict(size=12))
fig.show()

<font size="3" >**We can see in the above plot the variety of grapes and country are not related. That means one cannot predict which type of grapes are from which countries** <font>

In [1037]:
fig = px.scatter(df, x='points', y='price')
fig.update_layout(title_text='price vs points',
    xaxis_title_text='points',
    yaxis_title_text='price',
    font=dict(size=12))
fig.show()

<font size="3" >**We can see in the above plot that the price of the wine gradually increases as the points of that wine is increasing.** <font>

In [1038]:
df.head()

Unnamed: 0,user_name,country,review_description,points,price,province,region_1,winery,variety
0,@kerinokeefe,Italy,"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend
1,@vossroger,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Douro,Quinta dos Avidagos,Portuguese Red
2,@paulgwine,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris
3,@vboone,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling
4,@paulgwine,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir


<font size="3" >**We will now use Non-Negative Matrix Factorization to perform dimension reduction and clustering of the 'review description' column to cluster the reviews which are same by comapring the words used in them. We can use it in conjuction with TF-IDF to model topics across the review_description model**<font>

In [1039]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [1040]:
tfidf = TfidfVectorizer(max_df = 0.95, min_df = 2, stop_words='english')

**max_df** = maximum document frequency (show words that are not in more than 95 percent of data) 
**min_df** = minimum document frequency (shows atleast in 2 different data)

In [1041]:
dtm = tfidf.fit_transform(df['review_description']) 

In [1042]:
dtm

<2000x2815 sparse matrix of type '<class 'numpy.float64'>'
	with 44012 stored elements in Compressed Sparse Row format>

In [1043]:
from sklearn.decomposition import NMF

In [1044]:
nmf_model = NMF(n_components=7,random_state=42)

 **n_components shows how many different reviews we want, I will chosse this value as 10 but it can vary**

In [1045]:
nmf_model.fit(dtm)


The 'init' value, when 'init=None' and n_components is less than n_samples and n_features, will be changed from 'nndsvd' to 'nndsvda' in 1.1 (renaming of 0.26).



NMF(n_components=7, random_state=42)

In [1046]:
len(tfidf.get_feature_names())


Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.



2815

In [1047]:
len(nmf_model.components_)

7

In [1048]:
len(nmf_model.components_[0])

2815

In [1049]:
single_topic = nmf_model.components_[0]

In [1050]:
# Returns the indices that would sort this array.
single_topic.argsort()

array([1407, 1608, 1607, ..., 1071, 2509, 2751], dtype=int64)

In [1051]:
# Top 10 words for this topic:
single_topic.argsort()[-10:]

array([ 692, 2076, 2763, 2429, 2093,  998, 1074, 1071, 2509, 2751],
      dtype=int64)

In [1052]:
top_word_indices = single_topic.argsort()[-10:]

In [1053]:
for index,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['concentrated', 'drink', 'dense', 'aging', 'black', 'dark', 'rich', 'wood', 'structure', 'ripe', 'firm', 'fruits', 'fruit', 'tannins', 'wine']


THE TOP 15 WORDS FOR TOPIC #1
['attractive', 'ripe', 'texture', 'bright', 'fruits', 'character', 'light', 'red', 'soft', 'crisp', 'ready', 'wine', 'acidity', 'drink', 'fruity']


THE TOP 15 WORDS FOR TOPIC #2
['nose', 'crisp', 'white', 'flavors', 'fruit', 'palate', 'finish', 'peach', 'green', 'lime', 'pear', 'fresh', 'citrus', 'lemon', 'apple']


THE TOP 15 WORDS FOR TOPIC #3
['clove', 'leather', 'delivers', 'spice', 'licorice', 'nose', 'dried', 'offers', 'tannins', 'aromas', 'alongside', 'pepper', 'palate', 'cherry', 'black']


THE TOP 15 WORDS FOR TOPIC #4
['blackberry', 'sangiovese', '10', 'currant', 'malbec', '20', '50', 'petit', 'verdot', 'syrah', 'franc', 'merlot', 'blend', 'sauvignon', 'cabernet']


THE TOP 15 WORDS FOR TOPIC #5
['texture', 'like', 'bodied', 'silky', 'smoky', 'good', 'cola', 'cherries', 's

In [1054]:
dtm.shape

(2000, 2815)

In [1055]:
topic_results = nmf_model.transform(dtm)

In [1056]:
topic_results.shape

(2000, 7)

In [1057]:
topic_results[0] # coefficient value for top topics

array([0.        , 0.        , 0.08655895, 0.09125821, 0.        ,
       0.        , 0.        ])

In [1058]:
topic_results[0].round(2)

array([0.  , 0.  , 0.09, 0.09, 0.  , 0.  , 0.  ])

In [1059]:
topic_results[0].argmax() # to extract index positions

3

In [1060]:
topic_results.argmax(axis=1)

array([3, 1, 2, ..., 6, 6, 2], dtype=int64)

In [1061]:
df['reviews'] = topic_results.argmax(axis=1) # concatenating reviews with our dataframe

<font size="3" >**We have created 10 different articles reviews based on words, so in article column each review is denoted by integer that is related to topic that we saw earlier. So rows with same review might have same word description in their review_description column**<font>

In [1062]:
df.head()

Unnamed: 0,user_name,country,review_description,points,price,province,region_1,winery,variety,reviews
0,@kerinokeefe,Italy,"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend,3
1,@vossroger,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Douro,Quinta dos Avidagos,Portuguese Red,1
2,@paulgwine,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris,2
3,@vboone,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling,2
4,@paulgwine,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir,6


In [1063]:
df.drop(['review_description'],axis=1)

Unnamed: 0,user_name,country,points,price,province,region_1,winery,variety,reviews
0,@kerinokeefe,Italy,87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend,3
1,@vossroger,Portugal,87,15.000000,Douro,Douro,Quinta dos Avidagos,Portuguese Red,1
2,@paulgwine,US,87,14.000000,Oregon,Willamette Valley,Rainstorm,Pinot Gris,2
3,@vboone,US,87,13.000000,Michigan,Lake Michigan Shore,St. Julian,Riesling,2
4,@paulgwine,US,87,65.000000,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir,6
...,...,...,...,...,...,...,...,...,...
1995,@vboone,US,89,16.000000,New York,Finger Lakes,Billsboro,Riesling,2
1996,@JoeCz,New Zealand,89,14.000000,Marlborough,Marlborough,Brancott,Pinot Grigio,2
1997,@wineschach,Spain,89,30.000000,Northern Spain,Ribera del Duero,Salcis,Tempranillo,6
1998,@wineschach,Argentina,89,20.000000,Mendoza Province,Mendoza,Terrazas de Los Andes,Malbec,6


<font size="6" >**Modelling** <font>

<font size="3" >**Using one-hot encoding for all columns except target variable** <font>

In [1064]:
cat_cols = ['user_name', 'country', 'province','region_1','winery']
for col in cat_cols: # Perform one-hot encoding for each categorical column
    one_hot = pd.get_dummies(df[col], prefix=col)
    data = df.drop(col, axis=1)
    data = pd.concat([df, one_hot], axis=1)

In [1065]:
data.head()

Unnamed: 0,user_name,country,review_description,points,price,province,region_1,winery,variety,reviews,...,winery_Z'IVO,winery_Z. Alexander Brown,winery_Zahel,winery_Zenato,winery_Zerba Cellars,winery_Zolo,winery_Zudugarai,winery_Zull,winery_Ànima Negra,winery_Écluse
0,@kerinokeefe,Italy,"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,White Blend,3,...,0,0,0,0,0,0,0,0,0,0
1,@vossroger,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Douro,Quinta dos Avidagos,Portuguese Red,1,...,0,0,0,0,0,0,0,0,0,0
2,@paulgwine,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,Pinot Gris,2,...,0,0,0,0,0,0,0,0,0,0
3,@vboone,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,Riesling,2,...,0,0,0,0,0,0,0,0,0,0
4,@paulgwine,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,Pinot Noir,6,...,0,0,0,0,0,0,0,0,0,0


<font size="3" >**Using Label encoding for target variable** <font>

In [1066]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(data['variety'])

# Transform the target column using the label encoder
data['variety'] = le.transform(data['variety'])

In [1067]:
data.head()

Unnamed: 0,user_name,country,review_description,points,price,province,region_1,winery,variety,reviews,...,winery_Z'IVO,winery_Z. Alexander Brown,winery_Zahel,winery_Zenato,winery_Zerba Cellars,winery_Zolo,winery_Zudugarai,winery_Zull,winery_Ànima Negra,winery_Écluse
0,@kerinokeefe,Italy,"Aromas include tropical fruit, broom, brimston...",87,46.838095,Sicily & Sardinia,Etna,Nicosia,26,3,...,0,0,0,0,0,0,0,0,0,0
1,@vossroger,Portugal,"This is ripe and fruity, a wine that is smooth...",87,15.0,Douro,Douro,Quinta dos Avidagos,15,1,...,0,0,0,0,0,0,0,0,0,0
2,@paulgwine,US,"Tart and snappy, the flavors of lime flesh and...",87,14.0,Oregon,Willamette Valley,Rainstorm,13,2,...,0,0,0,0,0,0,0,0,0,0
3,@vboone,US,"Pineapple rind, lemon pith and orange blossom ...",87,13.0,Michigan,Lake Michigan Shore,St. Julian,19,2,...,0,0,0,0,0,0,0,0,0,0
4,@paulgwine,US,"Much like the regular bottling from 2012, this...",87,65.0,Oregon,Willamette Valley,Sweet Cheeks,14,6,...,0,0,0,0,0,0,0,0,0,0


In [1068]:
df_new = data.drop(['user_name','country','review_description','points','price','province','region_1','winery'],axis=1)

In [1069]:
df_new.head()

Unnamed: 0,variety,reviews,winery_14 Hands,winery_2Plank,winery_2nd Chance,winery_42°S,winery_Abbazia Santa Anastasia,winery_Acacia,winery_Acrobat,winery_Adega Cooperativa de Borba,...,winery_Z'IVO,winery_Z. Alexander Brown,winery_Zahel,winery_Zenato,winery_Zerba Cellars,winery_Zolo,winery_Zudugarai,winery_Zull,winery_Ànima Negra,winery_Écluse
0,26,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,15,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,13,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,19,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,14,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [1070]:
df_new.dtypes

variety               int32
reviews               int64
winery_14 Hands       uint8
winery_2Plank         uint8
winery_2nd Chance     uint8
                      ...  
winery_Zolo           uint8
winery_Zudugarai      uint8
winery_Zull           uint8
winery_Ànima Negra    uint8
winery_Écluse         uint8
Length: 1617, dtype: object

In [1071]:
X = df_new.drop('variety',axis=1)
y = df_new['variety']

In [1072]:
from sklearn.model_selection import train_test_split

In [1073]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=101)

<font size="5" >**Using Random Forest** <font>

In [1074]:
from sklearn.ensemble import RandomForestClassifier

In [1075]:
rfc = RandomForestClassifier(n_estimators=600)

In [1076]:
rfc.fit(X_train,y_train)

RandomForestClassifier(n_estimators=600)

In [1077]:
predictions = rfc.predict(X_test)

In [1078]:
from sklearn.metrics import classification_report,confusion_matrix

In [1079]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.41      0.50      0.45        46
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00         4
           3       0.43      0.05      0.09        63
           4       1.00      0.12      0.22         8
           5       0.41      0.74      0.52        68
           6       1.00      0.27      0.43        11
           7       0.00      0.00      0.00         7
           8       0.00      0.00      0.00         6
           9       0.75      0.17      0.27        18
          10       1.00      0.06      0.11        18
          11       0.22      0.81      0.34        21
          12       0.00      0.00      0.00         6
          13       0.33      0.17      0.22        12
          14       0.36      0.56      0.44        64
          15       0.50      0.07      0.12        15
          16       0.00      0.00      0.00         8
          17       0.24    


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



In [None]:
from sklearn import metrics
metrics.f1_score(y_test, predictions, average='weighted', labels=np.unique(predictions))

<font size="6" >**Using Decision Tree** <font>

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
dtree = DecisionTreeClassifier()

In [None]:
dtree.fit(X_train,y_train)

In [None]:
pred = dtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
print(classification_report(y_test,pred))

In [None]:
from sklearn import metrics
metrics.f1_score(y_test, pred, average='weighted', labels=np.unique(pred))