### Info
Data Source: HTRU2 dataset, UCI Machine Learning Repository (http://archive.ics.uci.edu/ml/datasets/HTRU2)

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the HTRU2 CSV into a dataframe; the path is relative, so it depends on
# the directory the notebook is run from.
df = pd.read_csv('../data/HTRU_2.csv')

In [None]:
def get_variable_category(series):
    '''Classify a pandas Series as numerical, date, unique text or categorical.

    Credit: Sian Lewis

    Parameters
    ----------
    series : pandas.Series
        The column to classify.

    Returns
    -------
    str
        'Numerical' for numeric dtypes, 'Date' for datetime dtypes
        (timezone-naive or timezone-aware), 'Text (Unique)' when every row
        holds a distinct value, and 'Categorical' otherwise.
    '''
    if pd.api.types.is_numeric_dtype(series):
        return 'Numerical'
    # is_datetime64_any_dtype also matches timezone-aware datetimes, which the
    # naive-only is_datetime64_dtype check would let fall through to the
    # text/categorical branches below.
    if pd.api.types.is_datetime64_any_dtype(series):
        return 'Date'
    # dropna=False counts a missing value as one distinct value. The count is
    # only computed here because the dtype branches above never need it.
    if series.nunique(dropna=False) == len(series):
        return 'Text (Unique)'
    return 'Categorical'


def print_variable_categories(dataframe):
    '''Print each column name followed by its variable category.

    Credit: Sian Lewis

    One line is printed per column of ``dataframe``; the category comes from
    ``get_variable_category``. Nothing is returned.
    '''
    for label in dataframe.columns:
        category = get_variable_category(dataframe[label])
        print(label, ': ', category)

def missing_values_table(df):
    '''Summarise the columns of a dataframe that contain missing values.

    Credit: Sian Lewis

    Prints how many columns the dataframe has and how many of them contain
    missing values, then returns the per-column details.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the
        columns 'Missing Values' (count) and '% of Total Values' (rounded to
        one decimal), sorted by percentage in descending order.
    '''
    row_count = len(df)
    mis_val = df.isnull().sum()
    # An empty dataframe would give 0/0 = NaN here, and NaN compares unequal
    # to zero, so every column used to be reported as having missing values.
    if row_count:
        mis_val_percent = 100 * mis_val / row_count
    else:
        mis_val_percent = mis_val.astype(float)
    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'])
    # Select on the raw count rather than on the derived percentage.
    mis_val_table = mis_val_table[mis_val_table['Missing Values'] > 0]
    mis_val_table = mis_val_table.sort_values(
        '% of Total Values', ascending=False).round(1)
    print('Your selected dataframe has {} columns.\n'
          'There are {} columns that have missing values.'.format(
              df.shape[1], mis_val_table.shape[0]))
    return mis_val_table

def datainspect(dataframe):
    '''Print data exploration information.

    Credit: Ritika Basher, Matt Speck, Sian Lewis

    Prints, in order: the shape, per-column missing-value counts, the number
    of duplicated rows, the dtypes, ``describe(include='all')``, the number of
    unique values per column, the variable category of each column and the
    missing-values table. Nothing is returned.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        The dataframe to inspect.
    '''
    print('ROWS AND COLUMNS: \n', dataframe.shape, '\n')
    print('MISSING VALUES: \n{}\n'.format(dataframe.isnull().sum()))
    print('DUPLICATE ROWS \n', dataframe.duplicated().sum(), '\n')
    print('DATA TYPES: \n' + str(dataframe.dtypes) + "\n")
    print('DATAFRAME DESCRIBE: \n \n', dataframe.describe(include='all'), '\n')
    print('UNIQUE VALUES:')
    for item in dataframe:
        print(item, dataframe[item].nunique())
    print('\n')
    print('VARIABLE CATEGORIES:', '\n')
    # print_variable_categories prints its own lines and returns None, so it
    # is called directly; wrapping it in print() emitted a stray "None".
    print_variable_categories(dataframe)
    print('MISSING MATRIX:', '\n')
    # missing_values_table prints a short summary and returns the table,
    # which is printed here.
    print(missing_values_table(dataframe))

def print_full_summary(dataframe):
    '''Run every summary helper on ``dataframe``, with a divider between them.

    The helpers run in this order: ``print_variable_categories``,
    ``missing_values_table`` (called for its printed summary; the returned
    table is discarded) and ``datainspect``. Nothing is returned.
    '''
    divider = '-------------------------------------'
    reports = (print_variable_categories, missing_values_table, datainspect)
    for position, report in enumerate(reports):
        # A divider goes between reports, not before the first one.
        if position:
            print(divider)
        report(dataframe)

In [None]:
# Print the variable categories, the missing-value summary and the full
# inspection report for the loaded data.
print_full_summary(df)

In [None]:
# Summary statistics of df; as the last expression in the cell, the result is
# what the notebook displays.
df.describe()

In [None]:
# Subset of df: the four integrated_* statistic columns plus is_pulsar
# (used below as the plot hue).
int_only = df[['integrated_mean', 'integrated_std', 'integrated_kurtosis', 'integrated_skew', 'is_pulsar']]

In [None]:
# Preview the first rows of the integrated_* subset.
int_only.head()

In [None]:
# Pairwise plots of the integrated_* columns, coloured by is_pulsar.
# sns.PairGrid(int_only)
# NOTE(review): the disabled plot_kws below nests the alpha under scatter_kws;
# confirm against the seaborn pairplot docs whether that key is accepted for
# the default plot kind before re-enabling it.
sns.pairplot(data=int_only, hue='is_pulsar')#, plot_kws=dict(scatter_kws = {'alpha': 0.1}))

In [None]:
# my_dict = dict(scatter_kws = {'alpha': 0.1})

# my_dict['scatter_kws']['alpha']

In [None]:
# Subset of df: the four dmsnr_* statistic columns plus is_pulsar
# (used below as the plot hue).
dmsnr_only = df[['dmsnr_mean', 'dmsnr_std', 'dmsnr_kurtosis', 'dmsnr_skew', 'is_pulsar']]

In [None]:
# Pairwise plots of the dmsnr_* columns, coloured by is_pulsar.
# NOTE(review): the disabled plot_kws below nests the alpha under scatter_kws;
# confirm against the seaborn pairplot docs whether that key is accepted for
# the default plot kind before re-enabling it.
sns.pairplot(data=dmsnr_only, hue='is_pulsar')#, plot_kws=dict(scatter_kws = {'alpha': 0.1}))

In [None]:
# integrated_mean (x) against dmsnr_mean (y); point colour is taken from is_pulsar.
plt.scatter(df['integrated_mean'], df['dmsnr_mean'], c=df['is_pulsar'])

In [None]:
# integrated_skew (x) against dmsnr_skew (y); point colour is taken from is_pulsar.
plt.scatter(df['integrated_skew'], df['dmsnr_skew'], c=df['is_pulsar'])

In [None]:
# Stacked 30-bin histogram of the two skew columns on a 10x8 figure.
plt.figure(figsize=(10,8))
# The trailing semicolon keeps the notebook from echoing plt.hist's return value.
plt.hist([df['integrated_skew'], df['dmsnr_skew']], bins=30, stacked=True);

In [None]:
# integrated_kurtosis (x) against dmsnr_kurtosis (y) on a 10x8 figure,
# coloured by is_pulsar.
plt.figure(figsize=(10,8))
# alpha=.1 draws the points mostly transparent, so overlapping points show up
# as darker regions; the trailing semicolon suppresses the echoed return value.
plt.scatter(df['integrated_kurtosis'], df['dmsnr_kurtosis'], c=df['is_pulsar'], alpha=.1);

In [None]:
# Stacked 30-bin histogram of the two kurtosis columns on a 10x8 figure.
plt.figure(figsize=(10,8))
# The trailing semicolon keeps the notebook from echoing plt.hist's return value.
plt.hist([df['integrated_kurtosis'], df['dmsnr_kurtosis']], bins=30, stacked=True);