In [2]:
import pandas as pd
import sys
import requests
import json
from pprint import pprint


#importing the google api key from a folder config
sys.path.append('../..')
from config import google_api_key

In [3]:
#importing the original CSV file
# Reads "winedata.csv" (path relative to the notebook's working directory) into
# a DataFrame and previews the first row to confirm the column layout.
# NOTE(review): the preview shows two leading unnamed columns ("Unnamed: 0.1",
# "Unnamed: 0"); presumably index columns written by earlier exports - confirm.

original_csv_file = pd.read_csv("winedata.csv")
original_csv_file.head(1)

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia


In [4]:
#Checking the number of wine reviews from the original CSV file
# (one review per row, counted through the "title" column)

review_count = len(original_csv_file["title"])
print(f'Total number of wine review is: {review_count}.')

Total number of wine review is: 129971.


In [5]:
#parsing out a year from the title & adding a new column consisting of years.
##Refer back to the "parsing_year" Jupyter Notebook file to see the verification for this process.

# Plain Python list of every title, in the same row order as the DataFrame.
title_list = list(original_csv_file["title"])

def hasNumbers(inputString):
    """Return True if any element of ``inputString`` passes ``str.isdigit()``.

    ``inputString`` may be any iterable of strings. Iterating a plain string
    yields single characters, so the result is True when the string contains
    at least one digit character. Iterating a list of strings yields whole
    tokens, so the result is True only when at least one token consists
    entirely of digits. An empty iterable gives False.
    """
    for element in inputString:
        if element.isdigit():
            return True
    return False

# Build year_list: one entry per title, either the vintage year (int) or the
# string "N/A" when no plausible year is present in the title.
year_list = []

for title in title_list:
    # Keep only whitespace-separated tokens that are plain decimal integers.
    # str.isdecimal() is used instead of str.isdigit() because isdigit() also
    # accepts characters such as superscript digits, which int() cannot parse
    # and which would abort the whole loop with a ValueError.
    numeric_tokens = [int(token) for token in title.split() if token.isdecimal()]

    # A token counts as a vintage only if it lies strictly between 1970 and 2019.
    plausible_years = [number for number in numeric_tokens if 1970 < number < 2019]

    # Exactly one value is appended per title (the first plausible year, or
    # "N/A"), so year_list stays aligned row-for-row with title_list.
    year_list.append(plausible_years[0] if plausible_years else "N/A")

# Positional override for a single row; this raises IndexError if the dataset
# has fewer than 102,844 titles.
year_list[102843] = 2000 #this one Title is an exception, refer back to "parsing_year" file for more info

In [6]:
#adding the new list to the DataFrame as a "year" column.
# DataFrame.assign returns a NEW frame, so original_csv_file keeps exactly the
# columns that were read from disk. A plain `edited_csv_file = original_csv_file`
# would only create a second name for the same object, and adding the column
# would silently modify the "original" frame as well.
# year_list must contain one entry per row; a length mismatch raises ValueError.

edited_csv_file = original_csv_file.assign(year=year_list)
print(f'Total number of wine review is: {len(edited_csv_file["title"])}.')
edited_csv_file.head(1)


Total number of wine review is: 129971.


Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,year
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,2013


In [6]:
#cleaning data

#1. Renaming the columns
column_labels = {
    "Unnamed: 0": "Index",
    "country": "Country",
    "description": "Description",
    "designation": "Designation",
    "points": "Grade",
    "province": "Province",
    "region_1": "Region",
    "taster_name": "Taster",
    "taster_twitter_handle": "Taster_Twitter",
    "title": "Title",
    "variety": "Variety",
    "winery": "Winery",
    "year": "Year",
    "price": "Price",
}
df = edited_csv_file.rename(columns=column_labels)

#2. Keeping only the columns below (this drops designation, region_2 and every other unlisted column)
kept_columns = [
    "Index", "Title", "Variety", "Year", "Price", "Grade", "Winery",
    "Province", "Region", "Country", "Taster", "Taster_Twitter", "Description",
]
df = df[kept_columns]

#3. Removing all wines that do not have year assigned
has_year = df["Year"] != "N/A"
df = df[has_year]

#4. Removing all wines that do not have Price tags
# Every missing value in the frame (not only Price) becomes the string "N/A",
# then rows whose Price is "N/A" are filtered out.
df = df.fillna("N/A")
has_price = df["Price"] != "N/A"
df = df[has_price]

#5. Removing Duplicates, if the same Taster reviewed the same wine more than once
# (the first occurrence of each duplicate group is kept)
duplicate_key = ["Title", "Price", "Grade", "Taster", "Description"]
df_duplicates_dropped = df.drop_duplicates(subset=duplicate_key)

#Summary
original_count = len(original_csv_file["description"])
filtered_count = len(df["Index"])
deduplicated_count = len(df_duplicates_dropped["Index"])
print(f'Total Number of Wine Reviews for original data frame is: {original_count}.')
print(f'Total Number of Wine Reviews for dataframe with empty Price and year removed is: {filtered_count}.')
print(f'Total Number of Wine Reviews for dataframe after duplicates removed is: {deduplicated_count}.')

Total Number of Wine Reviews for original data frame is: 129971.
Total Number of Wine Reviews for dataframe with empty Price and year removed is: 116802.
Total Number of Wine Reviews for dataframe after duplicates removed is: 107734.


In [7]:
#Helpful function that can quickly summarize the new dataframe. 

def summarize(dataframe):
    """Return a one-row DataFrame of distinct-value counts for key columns.

    ``dataframe`` must contain the columns Title, Winery, Province, Country,
    Year and Taster. Each output column holds the number of distinct values
    found in the corresponding input column. Note that "Total Number of Wine
    Reviews" is therefore the number of distinct titles, not the number of rows.
    """
    column_by_label = {
        "Total Number of Wine Reviews": "Title",
        "Total Number of Wineries": "Winery",
        "Total Number of Provinces": "Province",
        "Total Number of Countries": "Country",
        "Total Number of Years": "Year",
        "Total Number of Tasters": "Taster",
    }
    distinct_counts = {
        label: [len(dataframe[column].unique())]
        for label, column in column_by_label.items()
    }
    return pd.DataFrame(distinct_counts)


In [8]:
#Renaming the DataFrame & Dropping the index column
# drop() returns a new frame, so df_duplicates_dropped itself keeps "Index".

new_df = df_duplicates_dropped.drop(columns=["Index"])
summarize(new_df)

Unnamed: 0,Total Number of Wine Reviews,Total Number of Wineries,Total Number of Provinces,Total Number of Countries,Total Number of Years,Total Number of Tasters
0,107379,15475,416,43,41,20


In [9]:
#Compiling the list of Countries.
# value_counts() gives the number of rows per distinct "Country" value; the
# displayed output is ordered from the most to the least reviewed country.
# Rows whose country was missing appear here under the "N/A" label.

list_of_countries = new_df["Country"].value_counts()

list_of_countries

US                        49371
France                    15018
Italy                     14742
Spain                      5582
Portugal                   4273
Chile                      4106
Argentina                  3462
Austria                    2506
Australia                  2102
Germany                    1937
New Zealand                1232
South Africa               1180
Israel                      452
Greece                      406
Canada                      223
Bulgaria                    132
Hungary                     126
Romania                     102
Uruguay                      97
Turkey                       81
Georgia                      72
Slovenia                     69
Mexico                       68
Croatia                      67
N/A                          55
Moldova                      54
England                      47
Brazil                       35
Lebanon                      32
Morocco                      24
Peru                         16
Macedoni

In [10]:
#Collecting the countries that have less than 1,000 wine reviews in the dataset
# (names keep the order in which they appear in list_of_countries)

low_count_countries = [
    country
    for country, number_of_reviews in list_of_countries.items()
    if number_of_reviews < 1000
]

print(low_count_countries)

['Israel', 'Greece', 'Canada', 'Bulgaria', 'Hungary', 'Romania', 'Uruguay', 'Turkey', 'Georgia', 'Slovenia', 'Mexico', 'Croatia', 'N/A', 'Moldova', 'England', 'Brazil', 'Lebanon', 'Morocco', 'Peru', 'Macedonia', 'Czech Republic', 'Cyprus', 'India', 'Serbia', 'Switzerland', 'Ukraine', 'Luxembourg', 'Armenia', 'Bosnia and Herzegovina', 'China', 'Slovakia']


In [11]:
#Dropping rows from the bottom countries from the dataframe
# One isin() mask removes every low-count country in a single pass, instead of
# re-filtering the whole frame once per country through an index loop.
# An empty low_count_countries list leaves new_df unchanged.

new_df = new_df.loc[~new_df["Country"].isin(low_count_countries)]

summarize(new_df)

Unnamed: 0,Total Number of Wine Reviews,Total Number of Wineries,Total Number of Provinces,Total Number of Countries,Total Number of Years,Total Number of Tasters
0,105159,15000,240,12,41,18


In [12]:
#Verifying the list of remaining countries
# Recount after the filter; the labels of the recount are the countries left.

list_of_countries = new_df["Country"].value_counts()

remaining_countries = [country for country, _count in list_of_countries.items()]

print(remaining_countries)

['US', 'France', 'Italy', 'Spain', 'Portugal', 'Chile', 'Argentina', 'Austria', 'Australia', 'Germany', 'New Zealand', 'South Africa']


In [13]:
#Compiling the list of Years.
# value_counts() gives the number of remaining rows per distinct "Year" value;
# the displayed output is ordered from the most to the least reviewed year.

list_of_years = new_df["Year"].value_counts()

list_of_years

2013    13434
2012    13152
2014    13120
2011    10266
2010    10093
2015     8382
2009     8314
2008     6228
2007     6077
2006     4892
2005     3131
2016     3124
2004     1512
2000      722
2001      653
1999      608
1998      533
2003      464
2002      318
1997      294
1996       59
1995       45
1994       23
1992       14
2017        9
1989        6
1990        5
1988        5
1985        4
1991        4
1986        3
1993        3
1987        2
1983        2
1978        2
1980        2
1984        2
1982        1
1973        1
1974        1
1976        1
Name: Year, dtype: int64

In [14]:
#Collecting the Years that have less than 1,500 wine reviews in the dataset,
#listed from the most recent year to the oldest

low_count_years = sorted(
    (year for year, number_of_reviews in list_of_years.items() if number_of_reviews < 1500),
    reverse=True,
)
print(low_count_years)

[2017, 2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993, 1992, 1991, 1990, 1989, 1988, 1987, 1986, 1985, 1984, 1983, 1982, 1980, 1978, 1976, 1974, 1973]


In [15]:
#Dropping rows from years that have less than 1500 wine reviews from the dataframe
# One isin() mask removes every low-count year in a single pass, instead of
# re-filtering the whole frame once per year through an index loop.
# An empty low_count_years list leaves new_df unchanged.

new_df = new_df.loc[~new_df["Year"].isin(low_count_years)]

summarize(new_df)

Unnamed: 0,Total Number of Wine Reviews,Total Number of Wineries,Total Number of Provinces,Total Number of Countries,Total Number of Years,Total Number of Tasters
0,101379,14503,232,12,13,18


In [16]:
#Verifying the list of remaining Years
# Recount after the filter; the labels of the recount are the years left,
# printed from the most recent year to the oldest.

list_of_years = new_df["Year"].value_counts()

remaining_years = sorted((year for year, _count in list_of_years.items()), reverse=True)
print(remaining_years)

[2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004]


In [17]:
#exporting the cleaned dataframe to the CSV file "winedata_edited.csv"
# (path relative to the notebook's working directory)
# NOTE(review): to_csv writes the DataFrame index as an extra unnamed first
# column unless index=False is passed. The input file already carries
# "Unnamed: 0.1" / "Unnamed: 0" columns, which look like leftovers of this
# pattern. Consider index=False after confirming what downstream readers expect.

new_df.to_csv("winedata_edited.csv")