In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 1.0 Matplotlib Basics 

## Was ist Matplotlib? 
Matplotlib ist eine Programmbibliothek für die Programmiersprache Python, die es erlaubt, mathematische Darstellungen aller Art anzufertigen.

### <font color='red'>plt.plot()</font> $\Rightarrow$ Ermöglicht uns simple mathematische Zusammenhänge zu visualisieren

### Schema: plt.plot([x], y, [fmt], ...)

In [None]:
# Only y values are passed; the list positions 0..3 serve as the x values.
plt.plot([1,2,3,4]);

x ist optional: Wird nur y übergeben, so wird der Index (die Position, an der ein y-Wert z. B. in einer Liste steht) implizit als x-Wert angenommen!

In [None]:
# Explicit x values and their squares as the matching y values.
x = [1, 2, 3, 4]
y = [value ** 2 for value in x]
plt.plot(x, y);

## Bringen wir <font color='green'>Farben</font> <font color='blue'>und</font> <font color='red'>Formen</font> ins Spiel

Für kontinuierliche Datenpunkte (Linien) gibt es folgende Arten der Visualisierung: 

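* linestyle = ('-', '--', '-.', ':', '')
* linestyle = ('solid', 'dashed', 'dashdot', 'dotted', ...) – weitere Varianten wie 'loosely dotted' oder 'densely dotted' sind keine Namen, sondern werden als Tupel angegeben, z. B. (0, (1, 10))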


Für Marker gibt es folgende Arten der Visualisierung: 

* marker = ('o', 'v', '^', '<', '>', '8', 's', 'p', '*', 'h', 'H', 'D', 'd', 'P', 'X')


### Gekennzeichnet wird im Aufruf der Funktion wie folgt: 
* plt.plot(x, y, <font color='red'>'o'</font>)
* plt.plot(x, y, <font color='red'>'--'</font>)

In [None]:
# Format string 'o': circle markers only, no connecting line.
plt.plot(x, y, 'o')
plt.show();
# Keyword marker='X': the default line is kept and 'X' markers are added on top.
plt.plot(x, y, marker='X')
plt.show();
# Format string '--': dashed line.
plt.plot(x, y, '--')
plt.show();
# Keyword linestyle='dashed': the dashed line again, spelled out by name.
plt.plot(x, y, linestyle='dashed')
plt.show();

### <font color='red'>plt.show()</font> $\Rightarrow$ Zeigt den Plot an. In Notebooks ist der Aufruf nicht zwingend nötig, da der Plot ohnehin angezeigt wird; in Skripten ist er wichtig, da der Plot sonst zwar erstellt, aber nicht angezeigt wird.

In [None]:
# Plot without plt.show(): the notebook still renders the figure.
# The trailing ';' suppresses the text output of the plot call.
y = [1,2,3,4,5]
plt.plot(y);

In [None]:
# Same plot, now displayed explicitly by calling plt.show()
# (the call is what makes the figure appear when run as a plain script).
y = [1,2,3,4,5]
plt.plot(y)
plt.show()

### <font color='red'>plt.figure()</font> $\Rightarrow$ Erlaubt uns Anpassungen an der Form und Größe der Plots vorzunehmen.

### Schema: figure(num=None, figsize=None, dpi=None, facecolor=None, edgecolor=None, frameon=True)

In [None]:
# Open a new figure with figsize (10, 10) and draw the line plot into it.
plt.figure(figsize=(10,10))
plt.plot(y)
plt.show()

### <font color='red'>plt.legend()</font> $\Rightarrow$ Erlaubt uns Anpassungen an den Labels der Datenpunkte innerhalb eines Plots vorzunehmen. 

### Schema: legend([handles], labels, [loc], [prop], [fontsize], [facecolor], ...)

In [None]:
# The label passed to plot() is the text that legend() displays.
custom_label = "Ein paar Zahlen"
plt.figure(figsize=(10,10))
plt.plot(y, label=custom_label)
# Put the legend in the upper-left corner with font size 20.
plt.legend(loc='upper left', fontsize=20)
plt.show()

# Aufgabe 1: 

Zeichne eine Exponential-Funktion $f(x) = e^x $ für $x$ in $\{1,2,...,20\}$ <br>
Hint: Numpy bietet die Funktion np.exp() an. 

## 1.1 Bar Graph

In [None]:
# Years 1950..2019 (the stop value 2020 is excluded), used as x axis.
x = np.arange(1950, 2020)

In [None]:
# Standard-normal samples scaled by 2, shape (1, len(x)); no seed is set, so values differ per run.
y = 2 * np.random.randn(1, len(x))

In [None]:
# Bar chart of the random values per year; y[0] selects the single row of the 2-D array.
plt.bar(x,y[0])
plt.show();

In [None]:
# Wrap the random values in a DataFrame with one float column 'y', indexed by year.
df = pd.DataFrame(y[0], dtype='float', columns=['y'], index=x)

In [None]:
# Goal: colour every value above 0 red and every other value grey.
clrs = ['red' if value > 0 else 'grey' for value in df.y]

# Alternative goal: colour values at or above the mean red and values below the mean grey.
#clrs = ['red' if (x >= np.mean(df.y)) else 'grey' for x in df.y]

In [None]:
# Bar chart with one colour per bar taken from clrs.
plt.bar(df.index, df.y, color=clrs)

In [None]:
# Größe ändern 
plt.figure(figsize=(20,10))
plt.bar(df.index, df.y, label='Original')
plt.bar(df.index, -df.y, label='Inversed')
plt.legend()
plt.show()

## 1.2 Line Chart (Stacked Plot)

In [None]:
# Synthetic data: 50 yearly steps plus three rows of random integer offsets in 0..9.
rng = np.arange(50)
rnd = np.random.randint(0, 10, size=(3, rng.size))
yrs = 1950 + rng

# Stack the three series (linear trend plus offsets) on top of each other.
fig, ax = plt.subplots(figsize=(5, 3))
ax.stackplot(yrs, rnd + rng, labels=['Eastasia', 'Eurasia', 'Oceania'])
ax.set_title('Combined debt growth over time')
ax.set_ylabel('Total debt')
# Limit the x axis to the first and last year.
ax.set_xlim(yrs[0], yrs[-1])
ax.legend(loc='upper left')
fig.tight_layout()

In [None]:
# Plot the first series alone; the returned line objects are stored in first_value.
first_value = plt.plot(yrs, rnd[0] + rng)

In [None]:
# Plot the second series alone; the returned line objects are stored in second_value.
second_value = plt.plot(yrs, rnd[1] + rng)

In [None]:
# Plot the third series alone; stored under its own name so the second series' result is not overwritten.
third_value = plt.plot(yrs, rnd[2] + rng)

In [None]:
# Three small example series that share the same x values.
x = [1, 2, 3, 4, 5]
y1 = [1, 1, 2, 3, 5]
y2 = [0, 4, 2, 6, 8]
y3 = [1, 3, 5, 7, 9]

# Stack the series row-wise into one 2-D array (not used by the plot call in this cell).
y = np.vstack([y1, y2, y3])

labels = ["Fibonacci ", "Evens", "Odds"]

# Variant 1: pass each series as a separate positional argument.
fig, ax = plt.subplots()
ax.stackplot(x, y1, y2, y3, labels=labels)
ax.legend(loc='upper left')
plt.show()

In [None]:
# Variant 2: pass the stacked 2-D array y instead of the individual series.
fig, ax = plt.subplots()
ax.stackplot(x, y, labels=labels)
ax.legend(loc='upper left')
plt.show();

In [None]:
# 100000 standard-normal samples, shape (1, 100000).
y = np.random.randn(1,100000)
# 100000 samples from a normal distribution with mean 10 and standard deviation 0.5.
y_2 = np.random.normal(10,0.5,100000)

## 1.3 Histogram

Scaling mit xlim und ylim --> Unterschied in der Visualisierung der gleichen Verteilung kann riesig sein. 

In [None]:
# Coarse histogram of the standard-normal samples: 10 bins.
plt.hist(y[0], bins=10);

In [None]:
# Fine histogram of the same samples: 1000 bins.
plt.hist(y[0], bins=1000);

In [None]:
# Histogram of y_2 with the x axis limited to 1..20.
plt.hist(y_2, bins=1000)
plt.xlim(1,20);

In [None]:
# Same histogram with different axis limits: x limited to 10..11, y limited to 0..200.
plt.hist(y_2, bins=1000)
plt.xlim(10,11)
plt.ylim(0,200);

In [None]:
# Draw 10000 samples from a normal distribution with mean mu and standard deviation sigma.
mu, sigma = 10, 1.2
s = np.random.normal(mu, sigma, 10000)
# Histogram with 300 bins, normalised via density=True; the second return value is kept as `bins`.
count, bins, ignored = plt.hist(s, 300, density=True)
# Overlay the normal density formula evaluated at the values in `bins`, drawn as a red line.
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) *
               np.exp( - (bins - mu)**2 / (2 * sigma**2) ),
         linewidth=2, color='r')
plt.show()

# Short introduction to Pandas 

In [None]:
import pandas as pd

In [None]:
# Location of iris.csv in the seaborn-data repository on GitHub.
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'

In [None]:
# Import a data set: read the iris CSV from the `url` defined in the previous cell
# instead of repeating the address literal.
iris = pd.read_csv(url)

# Dataframe? Was ist das? 

* Here is what the technical definition looks like <br>
->    *Pandas DataFrame is a 2-D labeled data structure with columns of potentially different type.*
* And here is how you should understand it <br>
->    *A Pandas DataFrame is nothing but an in-memory representation of an Excel sheet in the Python programming language*

## 1.0 describe() 
Gibt Statistiken für numerische Variablen innerhalb eines Dataframes in Form einer Zusammenfassung aus

In [None]:
# Summary statistics of the numeric columns.
iris.describe()

## 2.0 info()
Gibt Informationen zu den einzelnen Spalten aus --> Fehlende Werte, Datentyp, Memory Usage, etc. 

In [None]:
# Per-column overview: missing values, data types, memory usage.
iris.info()

## 3.0 Überblick über Daten verschaffen mit head(), tail(), etc. 

In [None]:
# First rows of the data set.
iris.head()

In [None]:
# Last rows of the data set.
iris.tail()

In [None]:
# Number of missing values per column.
iris.isnull().sum()

# EDA or Exploratory Data Analysis

In [None]:
# https://www.kaggle.com/ekami66/detailed-exploratory-data-analysis-with-python

In [None]:
from sklearn.datasets import load_boston

In [None]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell only runs with an older scikit-learn release.
# The data set is loaded once; the earlier extra bare call discarded its result.
boston_data = load_boston()
# Load boston into a dataframe and set the field names
df = pd.DataFrame(boston_data['data'], columns=boston_data['feature_names'])

In [None]:
# Print the DESCR text attached to the loaded data set.
print(boston_data.DESCR)

In [None]:
# Hier werden die y Werte gespeichert
price = boston_data.target
df['Price'] = price

In [None]:
# Statistical summary of the values in df
df.describe()

In [None]:
# Info about missing values, data types, etc.
df.info()

In [None]:
# Scatter plot of column RM (x) against Price (y).
df.plot(kind='scatter', x='RM', y='Price');

In [None]:
# Scatter plot of column LSTAT (x) against Price (y).
df.plot(kind='scatter', x='LSTAT', y='Price');

In [None]:
# Histogram of the Price column
df.hist('Price')
plt.show()

In [None]:
# Histogram of the RM column
df.hist('RM')
plt.show()

In [None]:
# Pairwise correlation matrix of the columns of df.
corr = df.corr()
# Display it as a styled table with a 'coolwarm' background gradient computed over the whole table (axis=None).
corr.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
# Keep only correlations above 0.70; every other cell becomes NaN.
corr_bound = corr.where(corr>0.70)
corr_bound.style.background_gradient(cmap='coolwarm', axis=None)

In [None]:
import seaborn as sns

threshold = 0.9
# Pairwise Pearson correlations of the columns of df. This assignment used to be
# commented out, which made the lines below fail with a NameError.
df_corr = df.corr(method='pearson', min_periods=1)

# Boolean matrix: True where the absolute correlation exceeds the threshold.
high_corr = df_corr.abs() > threshold
# Long format: one boolean per (row label, column label) pair.
s = high_corr.stack()
# Label pairs flagged True; the diagonal (each column paired with itself) is part of the result.
s_final = s[s].index.tolist()

In [None]:
# Absolute pairwise correlations of the columns of df.
corr_matrix = df.corr().abs()
# Row/column positions of all entries above 0.8.
high_corr_var = np.where(corr_matrix > 0.8)
# Translate positions into column-name pairs; row < col keeps each pair once and skips the diagonal.
high_corr_var = [
    (corr_matrix.columns[row], corr_matrix.columns[col])
    for row, col in zip(*high_corr_var)
    if row < col
]
high_corr_var

In [None]:
def trimm_correlated(df_in, threshold):
    """Return df_in without columns that correlate strongly with an earlier column.

    The Pearson correlation matrix of df_in is computed, and only the entries
    above the diagonal are inspected. A column is dropped when its absolute
    correlation with any column positioned before it exceeds threshold.

    Parameters:
        df_in: DataFrame whose columns are compared pairwise.
        threshold: upper bound for the absolute correlation; a column is
            dropped only when the value is strictly greater.

    Returns:
        A DataFrame holding the kept columns of df_in in their original order.
    """
    df_corr = df_in.corr(method='pearson', min_periods=1)
    size = len(df_corr)
    # Hide the diagonal and the lower triangle so every pair is looked at once.
    lower_and_diagonal = np.tril(np.ones((size, size), dtype=bool))
    upper_abs = df_corr.mask(lower_and_diagonal).abs()
    # Masked cells are NaN and never compare greater than the threshold.
    keep = ~(upper_abs > threshold).any()
    # Select the labels directly with the boolean mask.
    return df_in[keep[keep].index]

In [None]:
# Example call with threshold 0.6 on the Boston frame; the returned frame is displayed.
trimm_correlated(df, 0.6)

In [None]:
import seaborn as sns
from sklearn import preprocessing
# Scale the selected feature columns with MinMaxScaler before plotting them against Price.
min_max_scaler = preprocessing.MinMaxScaler()
column_sels = ['LSTAT', 'INDUS', 'NOX', 'PTRATIO', 'RM', 'TAX', 'DIS', 'AGE']
x = df.loc[:,column_sels]
y = df['Price']
x = pd.DataFrame(data=min_max_scaler.fit_transform(x), columns=column_sels)
# 2 x 4 grid of axes, flattened so that one axes per feature can be addressed by position.
fig, axs = plt.subplots(ncols=4, nrows=2, figsize=(20, 10))
axs = axs.flatten()
# One regression plot (scaled feature vs. Price) per selected column.
for i, k in enumerate(column_sels):
    sns.regplot(y=y, x=x[k], ax=axs[i])
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=5.0)

In [None]:
# 5 %, 50 % (median) and 95 % quantiles of Price, requested in one call so that
# all three are displayed (a cell only shows the value of its last expression).
df['Price'].quantile([0.05, 0.5, 0.95])

In [None]:
# Box plot of the columns of df in one figure.
df.boxplot()

In [None]:
# Box plot of Price; showmeans=True additionally marks the mean.
plt.boxplot(df['Price'], showmeans=True)
plt.show()

In [None]:
# Box plot of RM with the mean marked and a notched box (notch=True).
plt.boxplot(df['RM'], showmeans=True, notch=True)
plt.show()

In [None]:
# Box plot of AGE with the mean marked.
plt.boxplot(df['AGE'], showmeans=True)
plt.show()

In [None]:
# Box plot of TAX with the mean marked.
plt.boxplot(df['TAX'], showmeans=True)
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

def f(x, sigma):
    """Evaluate 10 * sin(2 * pi * x) with additive Gaussian noise.

    Parameters:
        x: NumPy array of input positions.
        sigma: factor applied to the standard-normal noise; 0.0 gives the
            noise-free curve.

    Returns:
        Array of the same shape as x holding the sine values plus noise.
    """
    # One standard-normal draw per element of x, scaled by sigma.
    noise_term = sigma * np.random.randn(*x.shape)
    signal = 10 * np.sin(2 * np.pi * x)
    return signal + noise_term

# Number of sample points and the noise level that is passed to f as sigma.
train_size = 10000
noise = 1.7

# Evenly spaced inputs in [-0.5, 0.5], reshaped to a column of shape (train_size, 1).
X = np.linspace(-0.5, 0.5, train_size).reshape(-1, 1)
# Noisy targets, flattened to 1-D.
y = f(X, sigma=noise).ravel()
# Noise-free reference curve (sigma=0.0).
y_true = f(X, sigma=0.0)

In [None]:
from sklearn.svm import SVR
# Support vector regression with an RBF kernel, fitted on the noisy samples (X, y).
svr_rbf = SVR(kernel='rbf', C=1000, gamma=0.3, epsilon=.2)
svr_rbf.fit(X, y)

In [None]:
# Show noisy samples, model prediction and noise-free curve in one figure.
plt.figure(figsize=(20,10))
# Semi-transparent scatter of the noisy samples.
plt.scatter(X, y, alpha=0.3)
# Model prediction on the training inputs as a solid red line.
plt.plot(X, svr_rbf.predict(X), '-', label='Prediction', color='r')
# Noise-free curve as a dashed line.
plt.plot(X, y_true, '--', label='Ground Truth')
plt.title('Fit a Function to Training Data and Predict Values')
plt.legend();

In [None]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

def f(x, sigma):
    """Evaluate 10 * cos(2 * pi * x) with additive Gaussian noise.

    Parameters:
        x: NumPy array of input positions.
        sigma: factor applied to the standard-normal noise; 0.0 gives the
            noise-free curve.

    Returns:
        Array of the same shape as x holding the cosine values plus noise.
    """
    # One standard-normal draw per element of x, scaled by sigma.
    noise_term = sigma * np.random.randn(*x.shape)
    signal = 10 * np.cos(2 * np.pi * x)
    return signal + noise_term

# Number of sample points and the noise level that is passed to f as sigma.
train_size = 1000
noise = 1.7

# Evenly spaced inputs in [-0.5, 0.5], reshaped to a column of shape (train_size, 1).
X = np.linspace(-0.5, 0.5, train_size).reshape(-1, 1)
# Noisy targets, flattened to 1-D.
y = f(X, sigma=noise).ravel()
# Noise-free reference curve (sigma=0.0).
y_true = f(X, sigma=0.0)

In [None]:
from sklearn.svm import SVR
# Support vector regression with an RBF kernel, fitted on the noisy samples (X, y).
svr_rbf = SVR(kernel='rbf', C=10000, gamma=0.1, epsilon=0.1)
svr_rbf.fit(X, y)

In [None]:
# Show noisy samples, model prediction and noise-free curve in one figure.
plt.figure(figsize=(20,10))
# Semi-transparent scatter of the noisy samples.
plt.scatter(X, y, alpha=0.3)
# Model prediction on the training inputs as a solid red line.
plt.plot(X, svr_rbf.predict(X), '-', label='Prediction', color='r')
# Noise-free curve as a dashed line.
plt.plot(X, y_true, '--', label='Ground Truth')
plt.title('Fit a Function to Training Data and Predict Values')
plt.legend();