# Exploratory Data Analysis (EDA)

In [2]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('WhiskeyScrappedData.csv')

In [6]:
df

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,,3% ABV,£59.95
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,,3% ABV,£79.95
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,,45% ABV,£31.95
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,,47% ABV,£43.95
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,,8% ABV,£124.95
...,...,...,...,...,...,...,...,...
522,522,Teeling,Silver Reserve Irish,Irish,70cl,21 Year Old\n·\n1991\nVintage,46% ABV,£749.95
523,523,Teeling,Small Batch Irish,Irish,70cl,,46% ABV,
524,524,Teeling,Pineapple Rum Cask #2 - Small Batch Collaborat...,,70cl,,2% ABV,
525,525,Teeling,Ginger Beer - Small Batch Collaboration - 2022...,,70cl,,46% ABV,


#### `Replacing empty space with NAS`
- No Age Statement (NAS)
- Because some brands does not disclose age of distillery
### No Age Statement (NAS) whiskies are more common now because:

- They let distillers blend younger and older stock for consistency.

- They offer flexibility and avoid limiting perception by age alone.

In [9]:
replace0 = lambda x : 'NAS' if x == '' else x

In [11]:
df['Year'] = df['Year'].apply(replace0).head(50)
df['Year'] = df['Year'].fillna('NAS')

In [15]:
#Rechecking 
print(df['Year'].isna().sum())

0


### Removing Special characters and alphabet from ABV_Percent column and converting it to integer

In [18]:
df['ABV_Percent'] = df['ABV_Percent'].replace('\D' , '' , regex = True).astype(int)

In [20]:
df

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,£59.95
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,£79.95
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,£31.95
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,£43.95
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,£124.95
...,...,...,...,...,...,...,...,...
522,522,Teeling,Silver Reserve Irish,Irish,70cl,NAS,46,£749.95
523,523,Teeling,Small Batch Irish,Irish,70cl,NAS,46,
524,524,Teeling,Pineapple Rum Cask #2 - Small Batch Collaborat...,,70cl,NAS,2,
525,525,Teeling,Ginger Beer - Small Batch Collaboration - 2022...,,70cl,NAS,46,


#### Replacing Pound symbol with empty space and converting it to float

In [23]:
df['Price_Pounds']= df['Price'].replace('[£,]' , '' , regex = True).astype(float)

In [25]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price,Price_Pounds
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,£59.95,59.95
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,£79.95,79.95
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,£31.95,31.95
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,£43.95,43.95
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,£124.95,124.95


### Creating a new feature Price_Rupees

In [28]:
df['Price_Rupees'] = df['Price_Pounds'] * 113.60

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price,Price_Pounds,Price_Rupees
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,£59.95,59.95,6810.32
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,£79.95,79.95,9082.32
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,£31.95,31.95,3629.52
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,£43.95,43.95,4992.72
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,£124.95,124.95,14194.32


### Creating a 'ABV_Category' Column

Low: ABV < 40

Medium: 40 ≤ ABV ≤ 47

High: ABV > 47

In [36]:
def abv_category(x):
    if x < 40:
        return 'Low'
    elif x > 40 and x <47:
        return 'Medium'
    else:
        return 'High'

In [38]:
df['ABV_Category'] = df['ABV_Percent'].apply(abv_category)

In [40]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price,Price_Pounds,Price_Rupees,ABV_Category
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,£59.95,59.95,6810.32,Low
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,£79.95,79.95,9082.32,Low
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,£31.95,31.95,3629.52,Medium
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,£43.95,43.95,4992.72,High
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,£124.95,124.95,14194.32,Low


### Filling the null values with Forward Propogation

In [44]:
df['Type'].fillna(method = 'ffill' , inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Type'].fillna(method = 'ffill' , inplace = True)
  df['Type'].fillna(method = 'ffill' , inplace = True)


In [46]:
df.head(50)

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price,Price_Pounds,Price_Rupees,ABV_Category
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,£59.95,59.95,6810.32,Low
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,£79.95,79.95,9082.32,Low
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,£31.95,31.95,3629.52,Medium
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,£43.95,43.95,4992.72,High
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,£124.95,124.95,14194.32,Low
5,5,Maker's Mark,Kentucky Straight Bourbon,Kentucky,70cl,NAS,45,£35.95,35.95,4083.92,Medium
6,6,Ben Holladay,Soft Red Wheat Bottled In Bond Straight Bourbon,Bourbon,70cl,NAS,50,£79.95,79.95,9082.32,High
7,7,Booker's,2024-01 Batch Bourbon,Bourbon,70cl,7 Year Old,2,£89.95,89.95,10218.32,Low
8,8,Wild Turkey,Rare Breed Kentucky Straight Bourbon,Kentucky,70cl,NAS,4,£59.95,59.95,6810.32,Low
9,9,Wild Turkey,81 Kentucky Straight Bourbon,Kentucky,70cl,NAS,5,£29.95,29.95,3402.32,Low


### Dropping the Price Column

In [49]:
df.drop('Price' , inplace = True , axis = 1)

In [51]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Pounds,Price_Rupees,ABV_Category
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,59.95,6810.32,Low
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,79.95,9082.32,Low
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,31.95,3629.52,Medium
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,43.95,4992.72,High
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,124.95,14194.32,Low


### Checking Missing Values

In [53]:
df.isna().sum()

Unnamed: 0       0
Brand            0
Varieties        0
Type             0
Centi_liter      0
Year             0
ABV_Percent      0
Price_Pounds    48
Price_Rupees    48
ABV_Category     0
dtype: int64

In [55]:
df.describe()

Unnamed: 0.1,Unnamed: 0,ABV_Percent,Price_Pounds,Price_Rupees
count,527.0,527.0,479.0,479.0
mean,263.0,34.244782,526.715553,59834.89
std,152.276065,18.378127,3130.80241,355659.2
min,0.0,0.0,3.95,448.72
25%,131.5,9.0,43.95,4992.72
50%,263.0,43.0,64.95,7378.32
75%,394.5,46.0,139.95,15898.32
max,526.0,95.0,47999.95,5452794.0


### Replacing null values with median value in 'Price_Rupees' Column

In [59]:
df['Price_Rupees'].fillna(df['Price_Rupees'].median() , inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Price_Rupees'].fillna(df['Price_Rupees'].median() , inplace = True)


In [61]:
## Rechecking
df[df['Price_Rupees'].isna()]

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Pounds,Price_Rupees,ABV_Category


### Dropping 'Price_Pounds' Column

In [64]:
df.drop('Price_Pounds' , axis = 1 , inplace = True)

In [66]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Rupees,ABV_Category
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,6810.32,Low
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,9082.32,Low
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,3629.52,Medium
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,4992.72,High
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,14194.32,Low


In [68]:
## Rechecking
df.isna().sum()

Unnamed: 0      0
Brand           0
Varieties       0
Type            0
Centi_liter     0
Year            0
ABV_Percent     0
Price_Rupees    0
ABV_Category    0
dtype: int64

### Converting float to integer to Price_Rupees column

In [71]:
df['Price_Rupees'] = round(df['Price_Rupees'] , 0).astype(int)

In [73]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Rupees,ABV_Category
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70cl,NAS,3,6810,Low
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70cl,NAS,3,9082,Low
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70cl,NAS,45,3630,Medium
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75cl,NAS,47,4993,High
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70cl,NAS,8,14194,Low


### Removing cl from Centi_liter column

In [76]:
df['Centi_liter'] = df['Centi_liter'].replace('\D' , '' , regex = True).astype(int)

In [78]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Rupees,ABV_Category
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70,NAS,3,6810,Low
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70,NAS,3,9082,Low
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70,NAS,45,3630,Medium
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75,NAS,47,4993,High
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70,NAS,8,14194,Low


### Creating a new column 'Mili_liter'

In [81]:
df['Mili_liter'] = df['Centi_liter'] * 10

In [83]:
df.head()

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Rupees,ABV_Category,Mili_liter
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70,NAS,3,6810,Low,700
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70,NAS,3,9082,Low,700
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70,NAS,45,3630,Medium,700
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75,NAS,47,4993,High,750
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70,NAS,8,14194,Low,700


In [85]:
## Rechecking
df.isna().sum()

Unnamed: 0      0
Brand           0
Varieties       0
Type            0
Centi_liter     0
Year            0
ABV_Percent     0
Price_Rupees    0
ABV_Category    0
Mili_liter      0
dtype: int64

### Checking No of Duplicates

In [90]:
print(df.duplicated().sum())

0


In [98]:
df

Unnamed: 0.1,Unnamed: 0,Brand,Varieties,Type,Centi_liter,Year,ABV_Percent,Price_Rupees,ABV_Category,Mili_liter
0,0,Angel's Envy,Straight Port Cask Finish Bourbon,Bourbon,70,NAS,3,6810,Low,700
1,1,Angel's Envy,Manhattan Bundle - Martini Rubino Vermouth & S...,Bourbon,70,NAS,3,9082,Low,700
2,2,Buffalo Trace,Traveller Bourbon Whiskey (Chris,Bourbon,70,NAS,45,3630,Medium,700
3,3,Buffalo Trace,Kosher Wheat Recipe Bourbon,Bourbon,75,NAS,47,4993,High,750
4,4,Elijah Craig,Barrel Proof Kentucky Straight Bourbon,Barrel,70,NAS,8,14194,Low,700
...,...,...,...,...,...,...,...,...,...,...
522,522,Teeling,Silver Reserve Irish,Irish,70,NAS,46,85194,Medium,700
523,523,Teeling,Small Batch Irish,Irish,70,NAS,46,7378,Medium,700
524,524,Teeling,Pineapple Rum Cask #2 - Small Batch Collaborat...,Irish,70,NAS,2,7378,Low,700
525,525,Teeling,Ginger Beer - Small Batch Collaboration - 2022...,Irish,70,NAS,46,7378,Medium,700


In [None]:
df.to_csv('WhiskeyCleanedData.csv')