In [None]:
#import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import csv
import scipy.stats as st

In [None]:
# Load the cleaned wine data set; low_memory=False makes pandas parse the
# file in one pass instead of in chunks.
csv_path = "wine_data.csv"
wine_df = pd.read_csv(filepath_or_buffer=csv_path, low_memory=False)
wine_df

In [None]:
# Load the second data set (described further down as the top-ranked varieties)
variety_path = "variety_data.csv"
variety_df = pd.read_csv(filepath_or_buffer=variety_path, low_memory=False)
variety_df

In [None]:
# jeanaej's code starts here

## Find the Best Wine

In [None]:
# Summary statistics for the two numeric columns used throughout the analysis.
# The second line reports Price, so it is labelled "Price" (it used to say "Points").
print(f'Points min: {wine_df["Points"].min()}, median: {wine_df["Points"].median()}, mean : {wine_df["Points"].mean()}, max : {wine_df["Points"].max()}')
print(f'Price min: {wine_df["Price"].min()}, median: {wine_df["Price"].median()}, mean : {wine_df["Price"].mean()}, max : {wine_df["Price"].max()}')

In [None]:
# Distribution of rating points across all wines, drawn as a box plot
fig1, ax1 = plt.subplots()
ax1.boxplot(wine_df["Points"])
ax1.set_title('Rating Points')
plt.show()

In [None]:
# Distribution of price across all wines, drawn as a box plot
fig1, ax1 = plt.subplots()
ax1.boxplot(wine_df["Price"])
ax1.set_title('Price')
plt.show()

## By Country

In [None]:
# Alphabetically sorted list of every country present in the data
countries = sorted(wine_df["Country"].unique())
countries

In [None]:
# Group the wines by producing country
country_group = wine_df.groupby(["Country"])

# Price is skewed to the right, so each country is summarised by its median price
price_c_median = country_group["Price"].median()

# Points are close to normally distributed, so each country is summarised by its mean
points_c_mean = country_group["Points"].mean()

In [None]:
# Bar chart of the average rating points per country.
# NOTE: relies on `countries` (sorted unique names) lining up with the sorted
# group index of `points_c_mean`.
plt.bar(countries, points_c_mean, color = "gold", edgecolor = "black", align = "center")

# Zoom the y-axis to 85-90, per the author's note that ratings begin at 85 because
# of a payment threshold to get a wine rated.
plt.ylim(85, 90)

# Title, axis labels and vertical tick labels
plt.title("Average Rating Points by Country")
plt.xlabel("Countries")
plt.ylabel("Rating Point")
plt.xticks(rotation = 90)
plt.show()

Wine rating points are given to wines whose producers requested a rating. Rating points range from 80 to 100 and are approximately normally distributed. Comparing the average rating points by the country in which a wine is produced, Austria has the highest average rating.

In [None]:
# Bar chart of the median wine price per country
plt.bar(countries, price_c_median, color = "green", edgecolor = "black", align = "center")

# Title, axis labels and vertical tick labels
plt.xticks(rotation = 90)
plt.title("Median Price by Country")
plt.xlabel("Countries")
plt.ylabel("Price ($)")
plt.show()

Wine Prices are skewed to the right, leading to outliers; therefore, the median would better represent the majority of the prices. Using the median the US has the highest median price and Chile has the lowest median price.

Finding the best wine based on taste, one should pick a wine from Austria.

## By Variety

In [None]:
# Alphabetically sorted list of every variety present in the data
varieties = sorted(wine_df["Variety"].unique())
varieties

In [None]:
# Group the wines by grape variety
wine_variety_group = wine_df.groupby(["Variety"])

# Price is skewed to the right, so each variety is summarised by its median price
price_wv_median = wine_variety_group["Price"].median()

# Points are close to normally distributed, so each variety is summarised by its mean
points_wv_mean = wine_variety_group["Points"].mean()

In [None]:
# Bar chart of the average rating points for every variety in the data set
plt.bar(varieties, points_wv_mean, align = "center")

# Show the 80-100 band on the y-axis
plt.ylim(80,100)

# Title, axis labels and vertical tick labels
plt.xticks(rotation = 90)
plt.title("Average Rating Points by Variety")
plt.xlabel("Varieties")
plt.ylabel("Rating Points")
plt.show()

There are 545 different varieties within the cleaned wine data set. In order to further analyze the price and rating points of wine varieties, we created another data set with the 25 top ranked varieties.

In [None]:
# Alphabetically sorted variety names from the top-varieties data set
top_varieties = sorted(variety_df["Variety"].unique())
top_varieties

In [None]:
# Group the top-varieties data by variety
varieties_group = variety_df.groupby(["Variety"])

# Price is skewed to the right, so each variety is summarised by its median price
price_v_median = varieties_group["Price"].median()

# Points are close to normally distributed, so each variety is summarised by its mean
points_v_mean = varieties_group["Points"].mean()

In [None]:
# Bar chart of the average (mean) rating points for the top varieties
plt.bar(top_varieties, points_v_mean, color = "gold", edgecolor = "black", align = "center")

# Zoom the y-axis to 85-90.5, per the author's note that ratings begin at 85 because
# of a payment threshold to get a wine rated.
plt.ylim(85, 90.5)

# Title, axis labels and vertical tick labels
plt.xticks(rotation = 90)
plt.title("Average Rating Points by Variety")
plt.xlabel("Varieties")
plt.ylabel("Rating Points")
plt.show()

Using the average rating points per grape/wine variety, Nebbiolo has the highest average rating.

In [None]:
# Bar chart of the median price for the top varieties
plt.bar(top_varieties, price_v_median, color = "green", edgecolor = "black", align = "center")

# Title, axis labels and vertical tick labels
plt.xticks(rotation = 90)
plt.title("Median Price by Variety")
plt.xlabel("Varieties")
plt.ylabel("Price ($)")
plt.show()

Using the median the Nebbiolo has the highest median price and Pinot Grigio has the lowest median price.

Finding the best wine based on taste, one should pick a wine that is a Nebbiolo.

## Can you predict the price of a wine based on its rating?

## All Wine

In [None]:
# Price and rating-point columns for every wine
all_price, all_points = wine_df["Price"], wine_df["Points"]

### Price by Rating

In [None]:
# Scatter of price against rating points for every wine
plt.scatter(all_points, all_price, alpha = .4, color = "green", edgecolor = "black")
plt.xlim(75,105)
plt.grid(True)

# Title and axis labels
plt.title("All Wine Price by Rating Points")
plt.xlabel("Points")
plt.ylabel("Price ($)")

plt.show()

In [None]:
# Pearson correlation between rating points and price; its square is the
# coefficient of determination
r = st.pearsonr(all_points, all_price)
r_sq = r[0] ** 2

# Least-squares fit of price (y) on rating points (x)
slope, intercept, rvalue, pvalue, stderr = st.linregress(all_points, all_price)
regress_values = slope * all_points + intercept
line_eq = f' y = {round(slope,2)} x + {round(intercept,2)}'

# Report r, r squared and the fitted line
print(f'The correlation coefficient of price and rating points for all wines {round(rvalue,2)}\n')
print(f'The coefficient of determination of price and rating points for all wines {round(r_sq,2)}\n')
print(line_eq)

In [None]:
# Price against rating points for every wine, with the fitted regression
# line drawn first and its equation annotated on the plot
plt.plot(all_points, regress_values, "r-")
plt.scatter(all_points, all_price, alpha = .4, color = "green", edgecolor = "black")
plt.annotate(line_eq, (76,1500), color = "r", fontsize = 14)

plt.xlim(75,105)
plt.grid(True)

# Title and axis labels
plt.title("All Wine Price by Rating Points")
plt.xlabel("Points")
plt.ylabel("Price ($)")

plt.show()

With a correlation coefficient of 0.46, there is a weak positive correlation between wine prices and rating points across all wines.

## Wines Below $25

In [None]:
# Subset of economically priced wines: everything at or below the threshold
low_price = 25
low_price_df = wine_df[wine_df["Price"] <= low_price]
low_price_df

In [None]:
# Price and rating-point columns of the economical wines
lp_price, lp_points = low_price_df["Price"], low_price_df["Points"]

### Price by Rating Points

In [None]:
# Scatter of price against rating points for the economical wines
plt.scatter(lp_points, lp_price, alpha = .4, color = "green", edgecolor = "black")
plt.xlim(75,100)
plt.grid(True)

# Title and axis labels
plt.title(f"Economical Wines (${low_price} and below) Price by Rating Points")
plt.xlabel("Points")
plt.ylabel("Price ($)")

plt.show()

In [None]:
# Pearson correlation between rating points and price for the economical
# wines; its square is the coefficient of determination
r = st.pearsonr(lp_points, lp_price)
r_sq = r[0]**2

# Least-squares fit of price (y) on rating points (x)
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(lp_points, lp_price)
regress_values = lp_points * slope + intercept
line_eq = f' y = {round(slope,2)} x + {round(intercept,2)}'


# The messages name the subset analysed here (they used to say "all wines")
print(f'The correlation coefficient of price and rating points for wines at or below ${low_price}: {round(rvalue,2)}\n')
print(f'The coefficient of determination of price and rating points for wines at or below ${low_price}: {round(r_sq,2)}\n')
print(line_eq)

In [None]:
# Price against rating points for the economical wines, with the fitted
# regression line drawn first and its equation annotated on the plot
plt.plot(lp_points, regress_values, "r-")
plt.scatter(lp_points, lp_price, alpha = .4, color = "green", edgecolor = "black")
plt.annotate(line_eq, (89,3.5), color = "r", fontsize = 14)

plt.xlim(75,100)
plt.grid(True)

# Title and axis labels
plt.title(f"Economical Wines (${low_price} and below) Price by Rating Points")
plt.xlabel("Points")
plt.ylabel("Price ($)")

plt.show()

The wines are split at a price of $25 or less; $25 was chosen because the median wine price is $24.

Looking at the economical wines, there is a correlation coefficient of 0.37, meaning there is a weak positive correlation between the prices of cheaper wines and their rating points. So when choosing a wine, you could make a weak assumption that the higher the rating, the more expensive the wine.

## Wines Above $250

In [None]:
# Subset of expensive wines: everything at or above the threshold
high_price = 250
high_price_df = wine_df[wine_df["Price"] >= high_price]
high_price_df

In [None]:
# Price and rating-point columns of the expensive wines
hp_price, hp_points = high_price_df["Price"], high_price_df["Points"]

In [None]:
# Scatter of price against rating points for the expensive wines
plt.scatter(hp_points, hp_price, color = "green", edgecolor = "black", alpha = .4)

# Title and axis labels. The subset is Price >= high_price, so the title says
# "Expensive ... and above" (it used to say "Economical ... and below").
plt.title(f"Expensive Wines (${high_price} and above) Price by Rating Points")
plt.xlabel("Points")
plt.ylabel("Price ($)")

# A bare plt.xlim() only reads the limits; use the same 79-101 window as the
# regression plot of this subset so the two figures are comparable.
plt.xlim(79,101)
plt.grid(True)

plt.show()

In [None]:
# Pearson correlation between rating points and price for the expensive
# wines; its square is the coefficient of determination
r = st.pearsonr(hp_points, hp_price)
r_sq = r[0]**2

# Least-squares fit of price (y) on rating points (x)
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(hp_points, hp_price)
regress_values = hp_points * slope + intercept
line_eq = f' y = {round(slope,2)} x + {round(intercept,2)}'


# The messages name the subset analysed here (they used to say "all wines")
print(f'The correlation coefficient of price and rating points for wines at or above ${high_price}: {round(rvalue,2)}\n')
print(f'The coefficient of determination of price and rating points for wines at or above ${high_price}: {round(r_sq,2)}\n')
print(line_eq)

In [None]:
# Price against rating points for the expensive wines, with the fitted
# regression line and its equation

plt.plot(hp_points,regress_values, "r-")
plt.scatter(hp_points, hp_price, color = "green", edgecolor = "black", alpha = .4)

# Title and axis labels. The subset is Price >= high_price, so the title says
# "and above" (it used to say "and below").
plt.title(f"Expensive Wines (${high_price} and above) Price by Rating Points")
plt.xlabel("Points")
plt.ylabel("Price ($)")

plt.xlim(79,101)
plt.grid(True)
plt.annotate(line_eq, (80,1000), fontsize = 14, color ="r")

plt.show()

Looking at the expensive wines, there is a correlation coefficient of 0.2, meaning there is a negligible positive correlation between higher wine prices and rating points. So among expensive wines, it cannot be safely assumed that a wine with a higher rating will be more expensive.

## Are French and Italian wines comparable in rating and price?
#### Prediction: French wine is rated higher (based on the bar graph "Average Rating Points by Country"), and French and Italian wines are the same in price (based on the bar graph "Median Price by Country")

## France vs Italy

In [None]:
# Names of the two countries to compare
country_one, country_two = "France", "Italy"

# Both names in one list so the comparison cells can loop over them
country_comparisons = [country_one, country_two]
country_comparisons

### Comparing Rating Points

In [None]:
# Rating-point samples per compared country, collected for the box plot
compare_points = []

# For each compared country: gather its rating points, then report the
# quartiles and the 1.5 * IQR fences used to flag potential outliers
for country in country_comparisons:
    # Rows of wine_df that belong to this country, and their rating points
    wine_results = wine_df.loc[wine_df["Country"] == country]
    wine_points = wine_results["Points"]
    compare_points.append(wine_points)

    # Quartiles and interquartile range
    quartiles = wine_points.quantile([.25, .5, .75])
    lowerq, median, upperq = quartiles[0.25], quartiles[0.5], quartiles[0.75]
    iqr = upperq - lowerq

    # Fences beyond which a value is treated as a potential outlier
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    print("\n".join([
        f"The interquartile range of Rating points for {country} is: {round(iqr,2)}",
        f"The values below {round(lower_bound,2)} could be outlier for {country}",
        f"The median of rating points in {country} is {round(median,2)}",
        f"The values above {round(upper_bound,2)} could be outlier for {country}\n",
    ]))

In [None]:
# Side-by-side box plots of rating points for the two compared countries;
# outliers are drawn as gold circles
outlier = {"markerfacecolor": 'gold', "marker": "o"}
fig1, ax1 = plt.subplots()
ax1.boxplot(compare_points, labels = country_comparisons, flierprops = outlier, showfliers = True)
ax1.set_ylim(75,105)
ax1.grid(True)
ax1.set_title(f"Wine Rating Points of {country_one} and {country_two}")
ax1.set_xlabel("Countries")
ax1.set_ylabel("Rating Points")
plt.show()

### Comparing Price

In [None]:
# Price samples per compared country, collected for the box plots
compare_price = []

# For each compared country: gather its prices, then report the quartiles
# and the 1.5 * IQR fences used to flag potential outliers

for country in country_comparisons:
    # Rows of wine_df that belong to this country
    wine_results = wine_df.loc[wine_df["Country"] == country]

    # Prices of this country's wines
    wine_price = wine_results["Price"]

    # Keep the sample for plotting
    compare_price.append(wine_price)

    # Quartiles and interquartile range
    quartiles = wine_price.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    median = quartiles[0.5]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq

    # Fences beyond which a value is treated as a potential outlier
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    print(f"The interquartile range of Price for {country} is: {round(iqr,2)}")
    print(f"The values below {round(lower_bound,2)} could be outlier for {country}")
    # This loop summarises Price, so the median is labelled as a price
    # (it used to say "rating points").
    print(f"The median of price in {country} is {round(median,2)}")
    print(f"The values above {round(upper_bound,2)} could be outlier for {country}\n")

In [None]:
# Side-by-side box plots of price for the two compared countries;
# outliers are drawn as green circles
outlier = {"markerfacecolor": 'green', "marker": "o"}
fig1, ax1 = plt.subplots()
ax1.boxplot(compare_price, labels = country_comparisons, flierprops = outlier, showfliers = True)

ax1.grid(True)
ax1.set_title(f"Wine Price of {country_one} and {country_two}")
ax1.set_xlabel("Countries")
ax1.set_ylabel("Price ($)")

plt.show()

In [None]:
# Same price comparison with the outliers hidden (showfliers = False), so the
# boxes themselves are readable
outlier = {"markerfacecolor": 'green', "marker": "o"}
fig1, ax1 = plt.subplots()
ax1.boxplot(compare_price, showfliers = False, labels = country_comparisons)

ax1.grid(True)
ax1.set_title(f"Wine Price of {country_one} and {country_two}")
ax1.set_xlabel("Countries")
ax1.set_ylabel("Price ($)")

plt.show()

### Hypothesis Test - Independent Test

In [None]:
# Points and price samples for country one.
# Each sample is read from its own per-country frame. Reading from the
# `wine_results` variable left over from the earlier loop gave both countries
# the same rows, so the t-tests compared a sample with itself.
wine_results_one = wine_df.loc[wine_df["Country"] == country_one]
country_one_points = wine_results_one["Points"]
country_one_price = wine_results_one["Price"]

# Points and price samples for country two
wine_results_two = wine_df.loc[wine_df["Country"] == country_two]
country_two_points = wine_results_two["Points"]
country_two_price = wine_results_two["Price"]

In [None]:
# Independent two-sample t-test on the rating points of the two countries
# H0: the mean of sample one and the mean of sample two are identical
st.ttest_ind(a=country_one_points, b=country_two_points)

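With a p-value of 1 on an independent two-sample t-test, we fail to reject the null hypothesis, which says the mean of sample one and the mean of sample two are identical. Therefore, we fail to reject that French and Italian wines have the same average rating points. (Note: a p-value of exactly 1 means the two sample means were exactly equal, which for real data usually indicates that the same sample was passed to the test twice; the test inputs should be double-checked.)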

In [None]:
# Independent two-sample t-test on the prices of the two countries
# H0: the mean of sample one and the mean of sample two are identical
st.ttest_ind(a=country_one_price, b=country_two_price)

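With a p-value of 1 on an independent two-sample t-test, we fail to reject the null hypothesis, which says the mean of sample one and the mean of sample two are identical. Therefore, we fail to reject that French and Italian wines have the same average price. (Note: a p-value of exactly 1 means the two sample means were exactly equal, which for real data usually indicates that the same sample was passed to the test twice; the test inputs should be double-checked.)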

When choosing between French and Italian wines, their average rating points and prices are similar. Neither is better than the other.

In [None]:
# jeannaej's code ends here

In [None]:
# natalia's code starts here

In [None]:
# Count the number of distinct values in each column
wine_df.nunique()

### United States specific wine analysis

In [None]:
# Keep only the wines whose Country is "US"
US_only = wine_df[wine_df["Country"] == "US"]
US_only

In [None]:
# Count the non-null entries per column for each US varietal, i.e. how often
# each varietal appears in the data
us_var = US_only.groupby("Variety")
us_varieties = us_var.count()
us_varieties

In [None]:
# Total number of entries belonging to varietals with fewer than 1000 data
# points; these are lumped together as a single "other" slice.
# A vectorised sum replaces the row-by-row iterrows loop.
other = us_varieties.loc[us_varieties["Country"] < 1000, "Country"].sum()

print(other)

In [None]:
# Keep the US varietals that have at least 1000 entries
us_var_top = us_varieties[us_varieties["Country"] >= 1000]

In [None]:
# Preview the first rows of the filtered varietal counts
us_var_top.head()

In [None]:
# One-row frame holding the lumped "other" count in every column.
# Columns are presumably chosen to match those of us_var_top - TODO confirm.
# The row label is set through `index=`: the previous rename({"0": ...}) looked
# for the string "0", never matched the integer label 0, and left the row
# labelled 0.
other_wine = pd.DataFrame(
    [[other, other, other, other, other]],
    columns=["Country", "Points", "Price", "Province", "Region"],
    index=["Other (206 Varietals)"],
)
other_wine

In [None]:
# Add the "other" row to the top varietals, keeping the existing row labels.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
us_var_top = pd.concat([us_var_top, other_wine], ignore_index=False)
us_var_top

In [None]:
# Order the varietals from most to fewest entries
us_var_top = us_var_top.sort_values("Country", ascending=False)
us_var_top

In [None]:
# Inputs for a pie chart showing the prevalence of each varietal in the US

# Slice labels: the row labels of the varietal table
labels = us_var_top.index

# Slice sizes: the entry count of each varietal
sizes = us_var_top["Country"]

# The colors of each section of the pie chart
# colors = ["red", "orange", "lightcoral", "lightskyblue"]

# Pull out the slices at positions 1-3 (the first slice is left in place).
# The tuple is sized from the data so the pie call does not fail when the
# number of rows differs from 12; for 12 rows it equals the previous
# hard-coded tuple.
explode = tuple(0.1 if 1 <= position <= 3 else 0 for position in range(len(sizes)))

In [None]:
# Draw the varietal pie chart with percentage labels on each slice
plt.pie(sizes, explode=explode, labels=labels, autopct="%1.1f%%", shadow=True, radius=2.5)
plt.show()

In [None]:
# Entry counts per US province, ordered from most to fewest entries
province = (
    US_only
    .groupby("Province")
    .count()
    .sort_values(by="Country", ascending=False)
)
province

In [None]:
# Total number of entries belonging to the provinces that are NOT kept as
# "top" provinces. The top provinces are those with more than 1000 entries, so
# the remainder is everything with 1000 or fewer. The previous cut-off of 100
# silently dropped provinces with between 100 and 1000 entries.
other_prov = province.loc[province["Country"] <= 1000, "Country"].sum()

print(other_prov)

In [None]:
# One-row frame holding the lumped "other provinces" count in every column
other_province = pd.DataFrame(
    [[other_prov] * 5],
    columns=["Country", "Points", "Price", "Province", "Region"],
)
other_province

In [None]:
# Top wine-producing US provinces: those with more than 1000 entries
province_top = province[province["Country"] > 1000]
province_top

In [None]:
# Add the previously removed provinces back in as a single summed row.
# The old call discarded the result of DataFrame.append (and append was removed
# in pandas 2.0), so the "other" row never reached province_top.
# Rebuilding from `province` keeps the cell idempotent on re-run.
province_top = pd.concat([
    province.loc[province["Country"] > 1000],
    other_province.rename(index={0: "Other Provinces"}),
])
province_top

In [None]:
# province_top.loc["Other Provinces (469 Provinces)"] = [other_prov,other_prov,other_prov,other_prov,other_prov]
# province_top = province_top.sort_values(by="Country", ascending=False)
# province_top

In [None]:
# Inputs for a pie chart showing the prevalence of each province in the US

# Slice labels: the row labels of the province table
labels = province_top.index

# Slice sizes: the entry count of each province
sizes = province_top["Country"]

# The colors of each section of the pie chart
# colors = ["red", "orange", "lightcoral", "lightskyblue"]

# Pull out only the first slice. The tuple is sized from the data so the pie
# call does not fail when the number of rows differs from 5; for 5 rows it
# equals the previous hard-coded tuple.
explode = tuple(0.1 if position == 0 else 0 for position in range(len(sizes)))

In [None]:
# Draw the province pie chart with percentage labels on each slice
plt.pie(sizes, explode=explode, labels=labels, autopct="%1.1f%%", shadow=True, radius=2.5, startangle=90)
plt.show()

In [None]:
# Same province pie without slice labels; the names go in a legend instead so
# they cannot overlap
patches, text = plt.pie(sizes, explode=explode, shadow=True, radius=2.5, startangle=90)

plt.legend(patches, labels, fontsize=12, loc="best", bbox_to_anchor=(-0.1,1.))
plt.show()

# Avoiding overlapping labels
# https://stackoverflow.com/questions/23577505/how-to-avoid-overlapping-of-labels-autopct-in-a-matplotlib-pie-chart 
# Accessed 14 Feb 2021

### Which country produces "the biggest bang for your buck?"
* Identifying wines with lower prices but higher points.

In [None]:
# Add a points-per-dollar ratio column; the higher the value, the better the deal.
# Work on a copy: `ratio_df = wine_df` only aliased the frame, so the new
# column was also added to wine_df, which other cells read.
ratio_df = wine_df.copy()
ratio_df["Points:Price"] = ratio_df["Points"]/ratio_df["Price"]
ratio_df

In [None]:
# Mean of every numeric column (including the Points:Price ratio) per country.
# numeric_only=True is required on pandas >= 2.0, where averaging the text
# columns raises a TypeError instead of silently dropping them.
mean = ratio_df.groupby("Country").mean(numeric_only=True)
mean

In [None]:
# Median of every numeric column (including the Points:Price ratio) per country.
# numeric_only=True is required on pandas >= 2.0, where taking the median of
# the text columns raises a TypeError instead of silently dropping them.
median = ratio_df.groupby("Country").median(numeric_only=True)
median

In [None]:
# Put the per-country means and medians side by side (joined on the Country
# index) and rank the countries by mean points per dollar
mean_median = mean.merge(
    median,
    left_index=True,
    right_index=True,
    suffixes=("_mean", "_median"),
)
best = mean_median.sort_values("Points:Price_mean", ascending=False)
best

In [None]:
# Bar chart of the mean points-per-dollar ratio for each country, best first
point_per = best["Points:Price_mean"]
x_axis_country = best.index
plt.bar(x_axis_country, point_per, align="center")
plt.title("Average Points per Dollar")
plt.xlabel("Country")
plt.ylabel("Points per Dollar")
plt.xticks(rotation=45)

plt.show()

In [None]:
# natalia's code ends here

In [None]:
# sharon's code starts here

In [None]:
# Linear regression: is there a correlation between price and rating?
# Can we predict the rating based on price?

# US wines only, with the "extreme, obvious" price outliers (500 and up) dropped
us_wine_df = wine_df.loc[(wine_df['Country'] == 'US') & (wine_df['Price'] < 500.0)]
us_wine_df

In [None]:
# Use strict outliers?  Or just the extreme outliers... maybe try the strict first

def wine_regression(x_values, y_values, wine_variety, annotation_xy=(80, 250)):
    """Scatter-plot price against rating and overlay the least-squares line.

    Fits ``y_values`` on ``x_values`` with ``scipy.stats.linregress``, draws
    the scatter, the fitted line and the line's equation, then shows the figure.

    Parameters
    ----------
    x_values : array-like supporting element-wise arithmetic
        Rating points (callers in this notebook pass a pandas Series).
    y_values : array-like
        Prices, aligned with ``x_values``.
    wine_variety : str
        Name used in the plot title.
    annotation_xy : tuple of (float, float), optional
        Data coordinates at which the equation text is placed. Defaults to
        ``(80, 250)``, the position that used to be hard-coded; pass another
        point when the equation would fall outside the plotted data.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    (slope, intercept, rvalue, pvalue, stderr) = st.linregress(x_values, y_values)
    regress_values = x_values * slope + intercept
    line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

    plt.scatter(x_values, y_values, marker="o", facecolor="maroon", alpha=0.4)
    plt.plot(x_values, regress_values, "r-")
    plt.annotate(line_eq, annotation_xy, fontsize=15, color = "blue")
    plt.ylabel("Price of Wine (per bottle)")
    plt.xlabel("Rating of Wine (in Points)")
    plt.title(f"{wine_variety}: Wine Rating versus Price")
    plt.show()

In [None]:
# Regression of price on rating points for all US varieties
x_values = us_wine_df['Points']
y_values = us_wine_df['Price']

wine_regression(x_values, y_values, 'All Varieties')

In [None]:
# Pinot Noir: regression of price on rating points
us_pinot_df = us_wine_df[us_wine_df['Variety'] == 'Pinot Noir']
y_values = us_pinot_df['Price']
x_values = us_pinot_df['Points']

wine_regression(x_values, y_values, 'Pinot Noir')

In [None]:
## Pinot Noir prices as an array (reused by the box plot) and their quartiles
pinots = np.asarray(us_pinot_df['Price'])

pinot_quartiles = pd.DataFrame(pinots).quantile(q=[.25, .5, .75], axis=0)
pinot_quartiles

In [None]:
# Each .loc[...] is a one-element Series (one entry per column of the
# quantile table), hence the [0] lookups below.  #TODO: this reference needs to be cleaned up
pinot_lowerq = pinot_quartiles.loc[0.25]
pinot_median = pinot_quartiles.loc[0.50]
pinot_upperq = pinot_quartiles.loc[0.75]
pinot_iqr = pinot_upperq - pinot_lowerq

# 1.5 * IQR fences beyond which a price is treated as a potential outlier
pinot_lower_bound = pinot_lowerq - (1.5*pinot_iqr)
pinot_upper_bound = pinot_upperq + (1.5*pinot_iqr)

print(f'The lower quartile of Price for Pinot Noir is: {pinot_lowerq[0]}')
print(f'The upper quartile of Price for Pinot Noir is: {pinot_upperq[0]}')
print(f'The interquartile range of Price for Pinot Noir is: {pinot_iqr[0]}')
print(f'The median of Price for Pinot Noir is: {pinot_median[0]}')

print(f'Pinot Noir Price below {pinot_lower_bound[0]} could be outliers.')
print(f'Pinot Noir Price above {pinot_upper_bound[0]} could be outliers.')

In [None]:
## re-run the linear regression HERE - TODO

In [None]:
# Cabernet Sauvignon: regression of price on rating points
us_cab_df = us_wine_df[us_wine_df['Variety'] == 'Cabernet Sauvignon']
y_values = us_cab_df['Price']
x_values = us_cab_df['Points']

wine_regression(x_values, y_values, 'Cabernet Sauvignon')

In [None]:
## Cabernet Sauvignon prices as an array (reused by the box plot) and their quartiles
cabs = np.asarray(us_cab_df['Price'])

cabs_quartiles = pd.DataFrame(cabs).quantile(q=[.25, .5, .75], axis=0)
cabs_quartiles

In [None]:
# Each .loc[...] is a one-element Series (one entry per column of the
# quantile table), hence the [0] lookups below.  #TODO: this reference needs to be cleaned up
cabs_lowerq = cabs_quartiles.loc[0.25]
cabs_median = cabs_quartiles.loc[0.50]
cabs_upperq = cabs_quartiles.loc[0.75]
cabs_iqr = cabs_upperq - cabs_lowerq

# 1.5 * IQR fences beyond which a price is treated as a potential outlier
cabs_lower_bound = cabs_lowerq - (1.5*cabs_iqr)
cabs_upper_bound = cabs_upperq + (1.5*cabs_iqr)

print(f'The lower quartile of Price for Cabernet Sauvignon is: {cabs_lowerq[0]}')
print(f'The upper quartile of Price for Cabernet Sauvignon is: {cabs_upperq[0]}')
print(f'The interquartile range of Price for Cabernet Sauvignon is: {cabs_iqr[0]}')
print(f'The median of Price for Cabernet Sauvignon is: {cabs_median[0]}')

print(f'Cabernet Sauvignon Price below {cabs_lower_bound[0]} could be outliers.')
print(f'Cabernet Sauvignon Price above {cabs_upper_bound[0]} could be outliers.')

In [None]:
# Chardonnay, limited to bottles under 250: regression of price on rating points.
# This is a good candidate for a box plot b/c of the outlier
us_chard_df = us_wine_df.loc[(us_wine_df['Variety'] == 'Chardonnay') & (us_wine_df['Price'] < 250)]

y_values = us_chard_df['Price']
x_values = us_chard_df['Points']

wine_regression(x_values, y_values, 'Chardonnay')

In [None]:
## Chardonnay prices as an array (reused by the box plot) and their quartiles
chards = np.asarray(us_chard_df['Price'])

chard_quartiles = pd.DataFrame(chards).quantile(q=[.25, .5, .75], axis=0)
chard_quartiles

In [None]:
# Each .loc[...] is a one-element Series (one entry per column of the
# quantile table), hence the [0] lookups below.
chard_lowerq = chard_quartiles.loc[0.25]  
chard_median = chard_quartiles.loc[0.50]
chard_upperq = chard_quartiles.loc[0.75]
chard_iqr = chard_upperq - chard_lowerq

# These are Chardonnay statistics, so they are labelled "Chardonnay" (the
# messages used to say "Cabernet Sauvignon").
print(f'The lower quartile of Price for Chardonnay is: {chard_lowerq[0]}')
print(f'The upper quartile of Price for Chardonnay is: {chard_upperq[0]}')
print(f'The interquartile range of Price for Chardonnay is: {chard_iqr[0]}')
print(f'The median of Price for Chardonnay is: {chard_median[0]}')

# 1.5 * IQR fences beyond which a price is treated as a potential outlier
chard_lower_bound = chard_lowerq - (1.5*chard_iqr)
chard_upper_bound = chard_upperq + (1.5*chard_iqr)

print(f'Chardonnay Price below {chard_lower_bound[0]} could be outliers.')
print(f'Chardonnay Price above {chard_upper_bound[0]} could be outliers.')

In [None]:
# Redraw Chard with upper outliers removed
###  this is doing weird spacing things above the graph
#us_chard_df = us_chard_df.loc[us_chard_df['Price'] < 62.0]

#x_values = us_chard_df['Points']
#y_values = us_chard_df['Price']

#wine_regression(x_values, y_values, 'Chardonnay')

In [None]:
# Syrah: regression of price on rating points
us_syrah_df = us_wine_df[us_wine_df['Variety'] == 'Syrah']
y_values = us_syrah_df['Price']
x_values = us_syrah_df['Points']

wine_regression(x_values, y_values, 'Syrah')

In [None]:
## Syrah prices as an array (reused by the box plot) and their quartiles
syrahs = np.asarray(us_syrah_df['Price'])

syrah_quartiles = pd.DataFrame(syrahs).quantile(q=[.25, .5, .75], axis=0)
syrah_quartiles

In [None]:
# Each .loc[...] is a one-element Series (one entry per column of the
# quantile table), hence the [0] lookups below.
syrah_lowerq = syrah_quartiles.loc[0.25]  
syrah_median = syrah_quartiles.loc[0.50]
syrah_upperq = syrah_quartiles.loc[0.75]
syrah_iqr = syrah_upperq - syrah_lowerq

# These are Syrah statistics, so they are labelled "Syrah" (the messages used
# to say "Cabernet Sauvignon").
print(f'The lower quartile of Price for Syrah is: {syrah_lowerq[0]}')
print(f'The upper quartile of Price for Syrah is: {syrah_upperq[0]}')
print(f'The interquartile range of Price for Syrah is: {syrah_iqr[0]}')
print(f'The median of Price for Syrah is: {syrah_median[0]}')

# 1.5 * IQR fences beyond which a price is treated as a potential outlier
syrah_lower_bound = syrah_lowerq - (1.5*syrah_iqr)
syrah_upper_bound = syrah_upperq + (1.5*syrah_iqr)

print(f'Syrah Price below {syrah_lower_bound[0]} could be outliers.')
print(f'Syrah Price above {syrah_upper_bound[0]} could be outliers.')

In [None]:
# Box plot comparing prices of Pinot Noir, Cabernet Sauvignon, Chardonnay and Syrah

labels = ['Pinot Noir', 'Cabernet Sauvignon', 'Chardonnay', 'Syrah']
columns = [pinots, cabs, chards, syrahs]

fig, ax = plt.subplots()
ax.boxplot(columns, labels=labels)
ax.set_title('Wine Varieties')
ax.set_ylabel('Price')  # per bottle?  update?
plt.show()

In [None]:
# sharon's code ends here