# Mutual Information by Ryan Holbrook

All code is by Ryan Holbrook; see https://www.kaggle.com/mpwolke/exercise-mutual-information/edit

Feature Engineering Kaggle Micro-course by Ryan Holbrook.

https://www.kaggle.com/learn/feature-engineering

![](https://i.ytimg.com/vi/U9h1xkNELvY/maxresdefault.jpg)

Image source: youtube.com

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_selection import mutual_info_regression

# Set Matplotlib defaults.
# Newer Matplotlib releases ship the seaborn style sheets under a
# "seaborn-v0_8-" prefix and no longer accept the bare "seaborn-whitegrid"
# name, so use the prefixed name when this installation lists it and keep the
# legacy name for older installations.
plt.style.use(
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
# Let figures fit their contents automatically.
plt.rc("figure", autolayout=True)
# Bold, slightly larger axis labels and titles for every plot that follows.
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [None]:
# Load up to `nRowsRead` rows of the semicolon-separated german.csv file.
nRowsRead = 1000  # set to None to read the whole file
df = pd.read_csv(
    '../input/cusersmarildownloadsgermancsv/german.csv',
    sep=';',
    encoding="ISO-8859-2",
    nrows=nRowsRead,
)
# Tag the frame with the name of the file it was read from.
df.dataframeName = 'german.csv'
nRow = df.shape[0]
nCol = df.shape[1]
print(f'There are {nRow} rows and {nCol} columns')
# Show the first rows as the cell output.
df.head()

# Spoiler alert: no missing values.

In [None]:
# Utility functions from Tutorial
def make_mi_scores(X, y):
    """Score every column of ``X`` against ``y`` with mutual information.

    Works on a copy of ``X``. Columns of dtype ``object`` or ``category`` are
    replaced by their ``factorize`` integer codes, and every column whose
    dtype is an integer type is flagged as discrete for
    ``mutual_info_regression`` (called with ``random_state=0``).

    Returns a Series named "MI Scores", indexed by column name and sorted
    from the highest score to the lowest.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for name in categorical_cols:
        codes, _uniques = encoded[name].factorize()
        encoded[name] = codes
    # After encoding, integer dtype is the marker for a discrete feature.
    is_discrete = [
        pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes
    ]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return ranked.sort_values(ascending=False)

In [None]:
def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is sorted in ascending order before plotting. Bars whose label
    starts with "PROBE" are drawn in color "C3"; all other bars use "C0".
    The chart is drawn through the ``pyplot`` state interface, so it lands on
    whatever figure is current.
    """
    ordered = scores.sort_values(ascending=True)
    labels = list(ordered.index)
    positions = np.arange(len(ordered))
    bar_colors = np.array(["C0"] * ordered.shape[0])
    # Highlight probe features in red.
    probe_rows = [
        pos for pos, label in enumerate(labels) if label.startswith("PROBE")
    ]
    bar_colors[probe_rows] = "C3"
    # Create plot
    plt.barh(positions, ordered, color=bar_colors)
    plt.yticks(positions, labels)
    plt.title("Mutual Information Scores")

# Below, x="value" and col="variable" are not columns of the dataset; they are the column names that `melt` creates. Therefore, do not change them!

In [None]:
# Plot Creditability against each listed feature, one facet per feature.
# The frame is melted to long form first, which is where the "variable" and
# "value" column names come from.
features = ["Account_Balance", "Credit_Amount", "Duration_of_Credit_monthly"]
sns.relplot(
    data=pd.melt(df, id_vars="Creditability", value_vars=features),
    x="value",
    y="Creditability",
    col="variable",
    facet_kws={"sharex": False},
);

In [None]:
# Separate the target column from the predictors, then score each predictor.
# NOTE(review): make_mi_scores calls mutual_info_regression; if Creditability
# is a class label rather than a continuous value, mutual_info_classif may be
# the better estimator -- confirm before relying on these scores.
y = df["Creditability"].copy()
X = df.drop(columns="Creditability")

mi_scores = make_mi_scores(X, y)

In [None]:
# Show the 20 highest scores (mi_scores is already sorted in descending order).
print(mi_scores.iloc[:20])
# print(mi_scores.tail(20))  # uncomment to see bottom 20

In [None]:
# Bar chart of the 20 highest mutual-information scores.
plt.figure(figsize=(8, 5), dpi=100)
plot_mi_scores(mi_scores.iloc[:20])
# plot_mi_scores(mi_scores.tail(20))  # uncomment to see bottom 20

In [None]:
# Boxen plot of Duration_of_Credit_monthly for each Creditability value.
sns.catplot(
    data=df,
    x="Creditability",
    y="Duration_of_Credit_monthly",
    kind="boxen",
);
# Rotating the x tick labels (plt.xticks(rotation=45)) was tried here and did
# not improve the plot, so it is left out.
plt.title("Creditability & Duration of Credit Monthly")

In [None]:
# Boxen plot of Guarantors for each Creditability value.
sns.catplot(
    data=df,
    x="Creditability",
    y="Guarantors",
    kind="boxen",
);
plt.title("Creditability & Guarantors")

In [None]:
# Boxen plot of Account_Balance for each Creditability value.
sns.catplot(
    data=df,
    x="Creditability",
    y="Account_Balance",
    kind="boxen",
);
plt.title("Creditability & Account Balance")

In [None]:
# Regression plots of Creditability against Credit_Amount: one panel per
# Account_Balance value (three panels per row), colored by Guarantors.
feature = "Credit_Amount"

sns.lmplot(
    data=df,
    x=feature,
    y="Creditability",
    hue="Guarantors",
    col="Account_Balance",
    col_wrap=3,
    height=4,
    scatter_kws=dict(edgecolor='w'),
);

In [None]:
# Regression plots of Age_years against Creditability: one panel per
# Account_Balance value (three panels per row), colored by
# Payment_Status_of_Previous_Credit.
feature = "Creditability"

sns.lmplot(
    data=df,
    x=feature,
    y="Age_years",
    hue="Payment_Status_of_Previous_Credit",
    col="Account_Balance",
    col_wrap=3,
    height=4,
    scatter_kws=dict(edgecolor='w'),
);

In [None]:
#Code by Olga Belitskaya https://www.kaggle.com/olgabelitskaya/sequential-data/comments
from IPython.display import display,HTML
# Colors, web-font names and font sizes available to the HTML banner helper.
c1, c2 = '#eb3434', '#eb3446'
f1, f2 = 'Akronim', 'Smokum'
fs1, fs2 = 30, 15
def dhtml(string,fontcolor=c1,font=f1,fontsize=fs1):
    """Display ``string`` as a styled ``<h1>`` heading in the notebook output.

    The markup imports ``font`` from fonts.googleapis.com with the
    ``3d-float`` effect and renders the heading in ``fontcolor`` at
    ``fontsize`` pixels. ``string`` is inserted with ``%s`` formatting and is
    not HTML-escaped.
    """
    markup = "".join((
        "<style>\n    @import 'https://fonts.googleapis.com/css?family=",
        font,
        "&effect=3d-float';</style>\n"
        "    <h1 class='font-effect-3d-float' style='font-family:",
        font,
        "; color:",
        fontcolor,
        "; font-size:",
        str(fontsize),
        "px;'>%s</h1>" % string,
    ))
    display(HTML(markup))
    
    
# Render the closing banner with the default color, font and size.
dhtml(string='Be patient. Tô na área, @mpwolke war Hier.')