# Solutions for pandas exercises

In [None]:
%matplotlib inline

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Tips

In [None]:
tips = sns.load_dataset("tips")

In [None]:
tips

In [None]:
replace = {"Thur": "Thursday", "Sun": "Sunday", "Sat": "Saturday", "Fri": "Friday"}
tips["day"] = tips["day"].replace(replace)

In [None]:
# Tip (x) against total bill (y), one panel per sex; the day is encoded
# both by colour and by marker style.
sns.relplot(
    data=tips,
    x="tip",
    y="total_bill",
    col="sex",
    hue="day",
    style="day",
    markers=True,
)

In [None]:
tips["day"] = tips["day"].astype("category")
sns.relplot(x="total_bill", y="tip", hue="day", style="sex", data=tips)

In [None]:
plt.savefig("../output/tips.pdf")

## Occupations

In [None]:
FNAME = "https://raw.githubusercontent.com/justmarkham/DAT8/master/data/u.user"
df = pd.read_csv(FNAME, sep="|")

In [None]:
df = df.set_index("user_id")

In [None]:
# Print the last 10 rows first, then the first 25.
for preview in (df.tail(10), df.head(25)):
    print(preview)

In [None]:
# info() prints its report itself and returns None, so it must not be wrapped
# in print(). Alternative for the dtypes only: df.dtypes
df.info()

In [None]:
# Ages that occur least often: count each age, then keep every age whose
# count equals the smallest count (there may be several).
counts = df["age"].value_counts()
min_value = counts.min()
is_rarest = counts.eq(min_value)
print(counts.loc[is_rarest].index)

In [None]:
occ_counts = df["occupation"].value_counts()

In [None]:
type(occ_counts)

In [None]:
print(occ_counts.shape)
print(occ_counts[0])

In [None]:
occ_counts = occ_counts.sort_index()
fig, ax = plt.subplots()
occ_counts.plot.bar(ax=ax)

In [None]:
fig.savefig("../output/occupations.pdf")

## Euro 2012

In [None]:
df = pd.read_csv("../data/Euro_2012.csv", index_col='Team')

In [None]:
print(df.index.nunique())
print(df.sort_values("Shooting Accuracy", ascending=False).index[0])

In [None]:
# Select row(s) and column in a single .loc call instead of chaining two
# indexing operations.
print(df.loc["England", "Penalty goals"])
print(df.loc[["England", "Italy", "Russia"], "Shooting Accuracy"])

In [None]:
df.info()

In [None]:
df[["Saves-to-shots ratio", "Passing Accuracy", "Shooting Accuracy", "% Goals-to-shots"]].head()

In [None]:
df.info()

In [None]:
# Columns stored as text with a "%" sign; drop the sign and parse as float.
non_numeric_cols = [
    'Shooting Accuracy',
    'Passing Accuracy',
    '% Goals-to-shots',
]
for col in non_numeric_cols:
    without_sign = df[col].str.replace("%", "")
    df[col] = without_sign.astype(float)

In [None]:
# Scatter plot of shooting accuracy (x) against passing accuracy (y).
df.plot.scatter(x='Shooting Accuracy', y='Passing Accuracy')

In [None]:
df.sort_values("Shots on target", ascending=False).index[1]

In [None]:
df.drop("Italy").sort_values("Shots on target", ascending=False).index[1]

In [None]:
# Card counts per team, ordered by red cards and then yellow cards
# (most cards first).
discipline = df[["Yellow Cards", "Red Cards"]]
discipline = discipline.sort_values(by=['Red Cards', 'Yellow Cards'], ascending=False)
# Write next to the notebook's other results in ../output; this cell
# previously pointed at ./output, unlike every other export.
discipline.to_csv("../output/discipline.tsv", sep="\t")

## Alcohol

In [None]:
df = pd.read_csv("../data/drinks.csv")
df.head()

In [None]:
grouped = df.groupby(["continent"])
cols = ["beer_servings", "wine_servings"]
grouped[cols].mean().sort_values(cols, ascending=False).index[0]

In [None]:
new = df.melt(id_vars="continent", var_name="type",
              value_vars=["beer_servings", "spirit_servings", "wine_servings"],
              value_name="consumption")
sns.catplot(x="continent", y="consumption", col="type", data=new)

In [None]:
plt.savefig("../output/alcohol.pdf")

## Iris

In [None]:
iris = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", header=None)

In [None]:
iris.columns = ['sepal length (in cm)', 'sepal width (in cm)', 'petal length (in cm)', 'petal width (in cm)', 'class']

In [None]:
from numpy import nan

In [None]:
iris.loc[10:30, 'petal length (in cm)'] = nan

In [None]:
iris['petal length (in cm)'].isna().sum()

In [None]:
iris = iris.fillna(1.0)

In [None]:
iris.to_csv('./out/iris.csv', index=False)

In [None]:
# Plot only the float-typed measurement columns.
count = iris.select_dtypes("float")
# Save through the grid returned by catplot. The previously commented-out
# `fig.savefig(...)` referred to `fig`, which is the figure created for the
# occupations bar chart, not this plot.
iris_grid = sns.catplot(data=count)
iris_grid.savefig('../output/iris.pdf')

## Memory

In [None]:
df = pd.read_csv("https://query.data.world/s/wsjbxdqhw6z6izgdxijv5p2lfqh7gx")

In [None]:
print(df.info())
print(df.info(memory_usage="deep"))

In [None]:
df_copy = df.copy().select_dtypes(include=[object])

In [None]:
df_copy.describe()

In [None]:
CUTOFF = 0.49*df.shape[0]
few_unique = [col for col in df_copy.columns if df_copy[col].nunique() <= CUTOFF]

In [None]:
for col in few_unique:
    df[col] = df[col].astype('category')

In [None]:
# info() prints its report itself and returns None, so it must not be wrapped
# in print().
df.info(memory_usage="deep")

In [None]:
df["v_line_score"] = df["v_line_score"].astype(str)  # temporary fix because of bug https://issues.apache.org/jira/browse/ARROW-14087

In [None]:
df.to_csv("../output/large_file.csv")
df.to_feather("../output/large_file.ftr")