In [None]:
import numpy as np
import pandas as pd
# Let pandas display up to 50,000 rows before truncating a frame's output.
pd.set_option("display.max_rows", 50000)

# Load the wine-reviews sample and preview the first rows.
df = pd.read_csv("../input/wine-reviews-smallcsv/wine reviews_small.csv")
df.head()

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV —
# confirm against the file).
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because the column has already been dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview the first rows again after the 'Unnamed: 0' column was dropped.
df.head()

In [None]:
# List the column labels that remain in the frame.
print(df.columns)

In [None]:
# (rows, columns) before any missing-value handling.
df.shape

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Heatmap of the boolean missing-value mask: one cell per (row, column) entry.
sns.heatmap(df.isna())

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Quick look at the rows before cleaning the missing values.
df.head()

In [None]:
# Distinct values of the 'country' column (a missing value shows up as nan).
df['country'].unique()

## Dropping rows with missing values in the country and province columns, where missing values are <20

In [None]:
# Discard, in place, every row that lacks a country or a province.
location_cols = ['country', 'province']
df.dropna(subset=location_cols, inplace=True)

In [None]:
# (rows, columns) after dropping rows without a country or province.
df.shape

## Replacing missing values with mean and mode

In [None]:
# Impute the remaining gaps: text columns get their most frequent value (mode),
# the numeric 'price' column gets its mean.
#
# The fill values are collected into one mapping and applied with a single
# DataFrame.fillna call. The previous `df[col].fillna(..., inplace=True)` form
# is chained in-place assignment: it emits a FutureWarning on pandas 2.x and
# leaves `df` unchanged under Copy-on-Write. It also filled 'designation' twice.
mode_filled_cols = [
    'designation',
    'region_1',
    'region_2',
    'taster_name',
    'taster_twitter_handle',
]
fill_values = {col: df[col].mode()[0] for col in mode_filled_cols}
fill_values['price'] = df['price'].mean()

df = df.fillna(value=fill_values)

In [None]:
# Re-count missing values per column after the imputation step.
df.isna().sum()

In [None]:
# Preview the cleaned frame.
df.head()

## World map of countries and their wine varieties

In [None]:
import plotly.express as px

# Columns shown in the hover tooltip of each marker.
geo_hover_cols = ['points', 'price', 'province', 'title', 'variety', 'winery']

# Geographic scatter located by country name; markers are coloured by country,
# sized by price, and the animation slider steps through the values of 'price'.
fig = px.scatter_geo(
    df,
    locations="country",
    locationmode='country names',
    color="country",
    size="price",
    hover_name="country",
    hover_data=geo_hover_cols,
    animation_frame="price",
    projection="natural earth",
)
fig.show()

## Wine variety and its price

In [None]:
# Bar chart of price per variety, coloured by country. The figure is made very
# wide (10,000 px) so that every variety keeps its own x-axis tick.
fig = px.bar(
    df,
    x='variety',
    y='price',
    color='country',
    hover_name='country',
    title='wine variety and its price',
    width=10000,
    height=800,
)

# Treat the x axis as categorical and label every tick, rotated 90 degrees.
tick_font = {'family': 'Rockwell', 'color': 'crimson', 'size': 14}
fig.update_xaxes(
    type='category',
    tickmode='linear',
    showticklabels=True,
    tickangle=90,
    tickfont=tick_font,
)

# No outline on the bars and no gaps between bars or bar groups.
fig.update_traces(marker_line_width=0, selector={'type': "bar"})
fig.update_layout(bargap=0, bargroupgap=0)
fig.show()

In [None]:
# Column labels, for reference when choosing plot dimensions.
df.columns

In [None]:
import plotly.express as px

# Points vs. price: one marker per row of `df`, coloured by country and sized
# by price (largest marker capped at 30 px). Hovering shows the variety plus
# the designation, title and taster name.
fig = px.scatter(
    df,
    x='points',
    y='price',
    color='country',
    size='price',
    size_max=30,
    width=1000,
    hover_name='variety',
    hover_data=['designation', 'title', 'taster_name'],
)

# Only settings that affect a scatter figure are kept. The earlier bargap /
# bargroupgap layout options and the trace `textposition` were carried over
# from the bar chart; they have no effect here because the figure has no bar
# traces and no text labels.
fig.update_layout(
    hoverlabel=dict(font=dict(family='sans-serif', size=10)),
    autosize=True,
)
fig.show()

In [None]:
# Show a single row as a reminder of the available fields.
df.head(1)

## Top 10 wine countries

In [None]:
# Count the rows per country, most frequent first; `country` is reused by the
# world-map cell further down.
rows_per_country = df.groupby('country').size()
country = (
    rows_per_country
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)

# Bar chart of the ten countries with the most rows.
px.bar(country.head(10), x='country', y='count', template='simple_white')

We can now see that almost all the wines in the dataset come from the US and Europe (especially France, Italy and Spain). The third-largest region is South America, with Chile and Argentina.

## World map 

In [None]:
# Choropleth of row counts per country, matched by country name.
# The colour scale is pinned to the 2,000-15,000 range.
px.choropleth(
    country,
    locations='country',
    locationmode='country names',
    color='count',
    range_color=[2000, 15000],
    color_continuous_scale='Viridis',
    template='simple_white',
)

## Price distribution (move the range slider below the chart to zoom in on a price range)

In [None]:
# Histogram of prices (250 bins requested) with a range slider under the
# x axis for zooming into a price interval.
fig = px.histogram(df, x="price", nbins=250, template='simple_white')
slider_axis = {'rangeslider': {'visible': True}, 'type': "linear"}
fig.update_layout(xaxis=slider_axis)

## Price distribution of top 10 countries

As you can see, while wines from every origin start at around 5, only US and French wines include a bottle priced above 2,000.

In [None]:
# Per-country price statistics, limited to the ten countries with the most
# priced rows.
(
    df.groupby('country')['price']
    .agg(['count', 'min', 'max', 'mean'])
    .reset_index()
    .sort_values('count', ascending=False)
    .head(10)
)

In [None]:
# Price vs. points, restricted to rows priced above 30.
above_30 = df.loc[df['price'] > 30]
px.scatter(
    above_30,
    x='price',
    y='points',
    hover_name='country',
    hover_data=['variety', 'winery'],
    template='simple_white',
)

## French provinces

In [None]:
# Row counts per French province, largest first, shown as a pie chart.
french_rows = df[df['country'] == 'France']
france = (
    french_rows.groupby('province')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
px.pie(france, names='province', values='count')

Although Bordeaux and Burgundy are the most popular provinces for wine, France has quite a list of good wines throughout the country.

## US provinces

In [None]:
# Row counts per US province, largest first, shown as a pie chart.
us_rows = df[df['country'] == 'US']
us = (
    us_rows.groupby('province')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
)
px.pie(us, names='province', values='count', height=700, width=800)

California is the place to be if you want to produce wine in the US: it accounts for over 65% of the US wines in the dataset.

## India provinces

In [None]:
# Row counts per Indian province (left in group-key order, not sorted by count).
indian_rows = df[df['country'] == 'India']
india = indian_rows.groupby('province').size().reset_index(name='count')

In [None]:
# Display the per-province counts for India.
india

Nashik is the most famous province for wine in India

## Variety and its average price

In [None]:
# Mean price of each variety, most expensive first.
mean_price_by_variety = df.groupby('variety')['price'].mean()
variety = (
    mean_price_by_variety
    .reset_index()
    .sort_values('price', ascending=False)
)

In [None]:
# Display the full variety/mean-price table (sorted by descending mean price).
variety

## Top 10 most expensive wine varieties based on their average price

In [None]:
# First ten rows of `variety`, i.e. the highest mean prices.
priciest_varieties = variety.head(10)
px.bar(priciest_varieties, x='variety', y='price')

## Cheapest wine varieties based on their average price

In [None]:
# Last ten rows of `variety`, i.e. the lowest mean prices.
cheapest_varieties = variety.tail(10)
px.bar(cheapest_varieties, x='variety', y='price')