# Importing The Packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
import plotly
plotly.offline.init_notebook_mode (connected = True)

# Having First Look At The Data

In [None]:
# Load the two CSV inputs: the wine reviews and the country / US-state coordinate lookup
WINE_REVIEWS_CSV = '../input/wine-reviews/winemag-data-130k-v2.csv'
COORDINATES_CSV = '../input/latitude-and-longitude-for-every-country-and-state/world_country_and_usa_states_latitude_and_longitude_values.csv'

data = pd.read_csv(WINE_REVIEWS_CSV)
df = pd.read_csv(COORDINATES_CSV)

In [None]:
# Preview the first rows of the wine review table
data.head()

# Country Distribution For The Wines

In [None]:
# Number of reviews per country.
# The Series is passed as `x=` explicitly: recent seaborn releases treat the
# first positional argument as `data` and accept the variables by keyword only.
plt.figure(figsize=(18,8))
sns.countplot(x=data['country'])
plt.xticks(rotation=90)  # country names are long; rotate so they do not overlap
plt.show()

### Most of the wines in this dataset are made in the US

# Top 10 countries with the best wine (based on points)

In [None]:
# Average the review points of every country's wines, then rank the countries best-first
def points(countries=None):
    """Return the mean ``points`` for each requested country, as a list.

    Parameters
    ----------
    countries : iterable, optional
        Country labels to average over. Defaults to ``set(data.country)``,
        which is what a zero-argument call has always iterated.

    Returns
    -------
    list of float
        One mean per entry of ``countries``, in the same order. A label with
        no matching rows (for example a NaN label) yields NaN.
    """
    if countries is None:
        countries = set(data.country)
    # One grouped pass instead of re-filtering the whole frame once per country
    mean_by_country = data.groupby('country')['points'].mean()
    return mean_by_country.reindex(list(countries)).tolist()


# Materialise the label list once so names and scores are guaranteed to line up
countries = list(set(data.country))
score = points(countries)
country_score = pd.DataFrame({'country': countries, 'points': score})
# Drop rows where either field is NaN and sort best-first, without in-place mutation
country_score = country_score.dropna().sort_values('points', ascending=False)

# Attach a [longitude, latitude] pair to every ranked country using the lookup table `df`.
# England is hard-coded (presumably because the lookup table has no such entry - confirm).
# Its pair is stored in the same [longitude, latitude] order as every other row;
# 52.3555 N, 1.1743 W, so the longitude is negative.
ENGLAND_LON_LAT = [-1.1743, 52.3555]

location = []
for country_name in country_score['country'].values:
    if country_name == 'England':
        location.append(list(ENGLAND_LON_LAT))
        continue
    # Filter the lookup table once and read both coordinates from the first match
    matches = df.loc[df['country'] == country_name, ['longitude', 'latitude']]
    if matches.empty:
        # Unknown country: keep the row but mark its position as missing
        location.append([np.nan, np.nan])
    else:
        first_match = matches.iloc[0]
        location.append([first_match['longitude'], first_match['latitude']])

country_score['location'] = location

In [None]:
# Preview the top of the ranking (rows are already sorted by mean points, descending)
country_score.head()

In [None]:
import plotly as py
import plotly.graph_objs as go
# Choropleth trace description: one entry per country, coloured by its mean points
datas = {
    'type': 'choropleth',
    'locations': list(country_score['country']),
    'locationmode': 'country names',
    'z': list(country_score['points']),
}

In [None]:
# Build and display the choropleth figure.
# It is bound to `world_map` rather than `map` so the built-in map() is not
# shadowed for the rest of the kernel session.
world_map = go.Figure(data=[datas])
world_map

In [None]:
# Bar chart of the mean points per country
fig, ax = plt.subplots(figsize=(18, 8))
sns.barplot(x='country', y='points', data=country_score, ax=ax)
ax.tick_params(axis='x', labelrotation=90)  # rotate labels so country names stay readable
plt.show()

On average, the best-rated wines are produced by England, followed by India and Austria.

# Making Recommendations for Wine

## Making A Function To Tokenize Description And Removing Stopwords

In [None]:
# Replace missing descriptions with an empty string
data['description']=data['description'].fillna('')

In [None]:
# Tokenize a text column and drop stopwords from it
def rem_stopwords_tokenize(data,name,stop_words=None,tokenizer=None):
    """Replace column ``name`` of ``data`` with stopword-free token lists, in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is rewritten (the frame itself is mutated).
    name : str
        Column holding one text string per row.
    stop_words : iterable of str, optional
        Words to drop. Defaults to ``stopwords.words('english')``.
    tokenizer : callable, optional
        Maps a string to a list of tokens. Defaults to ``word_tokenize``.

    Returns
    -------
    None
        The result is written back to ``data[name]``.

    Notes
    -----
    Tokens are compared to the stopwords exactly as produced by the tokenizer
    (case-sensitive). NOTE(review): if the stopword list is all lower-case,
    capitalised stopwords such as "The" survive - confirm whether that is intended.
    """
    # Resolve the defaults once per call; previously the stop-word set was
    # rebuilt from the corpus for every single row.
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_words = set(stop_words)
    if tokenizer is None:
        tokenizer = word_tokenize

    data[name] = [
        [token for token in tokenizer(text) if token not in stop_words]
        for text in data[name].values
    ]

In [None]:
# Tokenize the 'description' column and drop stopwords (overwrites the column in `data`)
rem_stopwords_tokenize(data,'description')

In [None]:
# Lemmatize every token of a tokenized column
lemmatizer = WordNetLemmatizer()
def lemmatize_all(data,name):
    """Overwrite ``data[name]`` with lemmatized copies of its token lists.

    Each token is lemmatized twice: first with ``pos='a'``, then again with
    the lemmatizer's default part of speech. The frame is modified in place
    and nothing is returned.
    """
    def _lemma(token):
        # Pass with pos='a' first, then a second pass with the default pos
        return lemmatizer.lemmatize(lemmatizer.lemmatize(token, pos='a'))

    data[name] = [[_lemma(token) for token in tokens] for tokens in data[name]]

In [None]:
# Lemmatize every token in the 'description' column (overwrites the column in `data`)
lemmatize_all(data,'description')

# Having a look at the updated data

In [None]:
# Preview the table again; 'description' now holds lists of tokens instead of raw text
data.head()

OK, the descriptions are now tokenized and lemmatized, so the data is ready for the recommender.

# Making a function for set-based recommendations

In [None]:
def set_rec_finder(na,number,wines=None):
    """Print the ``number`` wines whose descriptions overlap most with wine ``na``.

    Similarity is the count of distinct tokens shared between the token list
    of the wine titled ``na`` and that of each row. Rows are ranked by
    (shared-token count, title), both descending, so ties go to the
    alphabetically later title. The query wine itself is part of the ranking.

    Parameters
    ----------
    na : str
        Title of the reference wine; the first row with this title is used.
    number : int
        How many titles to print.
    wines : pandas.DataFrame, optional
        Table with a ``title`` column and a ``description`` column of token
        lists. Defaults to the notebook-level ``data`` frame.

    Raises
    ------
    IndexError
        If no row is titled ``na``.
    """
    if wines is None:
        wines = data

    query_rows = wines.loc[wines['title'] == na, 'description']
    if query_rows.empty:
        raise IndexError('no wine titled {!r} in the data'.format(na))
    # Build the reference set once instead of once per compared row
    query_tokens = set(query_rows.iloc[0])

    scored = [
        (len(query_tokens.intersection(tokens)), title)
        for tokens, title in zip(wines['description'].values, wines['title'].values)
    ]
    scored.sort(reverse=True)

    # Slice the plain list; no need to copy every row into a string array first
    for _, title in scored[:number]:
        print(title)

In [None]:
# Let's use the recommendations :)
# Example: print the 10 wines whose descriptions share the most tokens with
# 'Rainstorm 2013 Pinot Gris (Willamette Valley)'
set_rec_finder('Rainstorm 2013 Pinot Gris (Willamette Valley)',10)

# These recommendations were made on the basis of relevance

# Let's find some recommendations on the basis of Ratings :)

So we will take the top 20 most relevant recommendations and then rank them by their points, keeping the 10 highest-rated.

In [None]:
def set_rec_finder_by_popularity(na,number,top=10,wines=None):
    """Print, and return, the best-rated wines among those most similar to ``na``.

    Step 1 shortlists the ``number`` rows whose descriptions share the most
    distinct tokens with the wine titled ``na`` (ties broken by title,
    descending). Step 2 re-ranks that shortlist by ``points`` (then title),
    both descending, and keeps the first ``top`` entries.

    Parameters
    ----------
    na : str
        Title of the reference wine; the first row with this title is used.
    number : int
        Size of the relevance shortlist.
    top : int, optional
        How many of the shortlisted wines to output. Defaults to 10, the
        cut-off that used to be hard-coded.
    wines : pandas.DataFrame, optional
        Table with ``title``, ``points`` and a ``description`` column of
        token lists. Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    list
        The printed titles, best-rated first. Empty when the shortlist is empty.

    Raises
    ------
    IndexError
        If no row is titled ``na``.
    """
    if wines is None:
        wines = data
    titles = wines['title'].values

    query_rows = wines.loc[wines['title'] == na, 'description']
    if query_rows.empty:
        raise IndexError('no wine titled {!r} in the data'.format(na))
    # Build the reference set once instead of once per compared row
    query_tokens = set(query_rows.iloc[0])

    by_relevance = sorted(
        (
            (len(query_tokens.intersection(tokens)), title)
            for tokens, title in zip(wines['description'].values, titles)
        ),
        reverse=True,
    )
    shortlist = [title for _, title in by_relevance[:number]]

    # A title's score is the points of the FIRST row carrying that title
    first_points = {}
    for title, pts in zip(titles, wines['points'].values):
        first_points.setdefault(title, pts)

    by_points = sorted(((first_points[title], title) for title in shortlist), reverse=True)
    best = [title for _, title in by_points[:top]]
    for title in best:
        print(title)
    return best
        

In [None]:
# Shortlist the 20 wines most similar to this one, then print the best-rated among them
x=set_rec_finder_by_popularity('Rainstorm 2013 Pinot Gris (Willamette Valley)',20)
    

# Yummmm !!! Wanna taste them all rn 

# Haha if you liked the file don't forget to upvote :)