In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Read the data

In [None]:
# Load the training set; %time reports how long the CSV parse takes.
%time train = pd.read_csv('../input/train.csv')
# Display the frame's dimensions and its column labels.
train.shape, train.columns

In [None]:
# Load the test set; %time reports how long the CSV parse takes.
%time test = pd.read_csv('../input/test.csv')
# Display the frame's dimensions and its column labels.
test.shape, test.columns

# Look at the data

## Overview

### 3147 integer columns, 1845 float columns (including target) and 1 string column (ID)

In [None]:
# Count how many training columns there are of each dtype.
train.dtypes.value_counts()

### No missing values; all values are nonnegative

In [None]:
# Feature matrix: every training column except the identifier and the label.
features = train.drop(['ID', 'target'], axis=1)
# Overall minimum, overall maximum, and whether any cell is missing.
(features.min().min(), features.max().max(), features.isna().any().any())

In [None]:
# Run the same checks on a random subset of test rows (as many rows as the
# training set has). Sampling BEFORE dropping 'ID' selects exactly the same
# rows (the draw depends only on the row count and the seed) but avoids
# building a full-size copy of the test frame just to throw most of it away.
test_features = test.sample(n=features.shape[0], random_state=123).drop(columns='ID')
# Overall minimum, overall maximum, and whether any cell is missing.
test_features.min().min(), test_features.max().max(), test_features.isnull().any().any()

### Almost all entries are zeros (very sparse data)

In [None]:
# Sparsity pattern of the training features: a mark for each strictly positive entry.
plt.figure(figsize=(5, 5))
plt.spy(features.gt(0).values);

In [None]:
# Percentage of training feature cells that are exactly zero.
features.eq(0).sum().sum() / features.size * 100

In [None]:
# Sparsity pattern of the sampled test features: a mark for each strictly positive entry.
plt.figure(figsize=(5, 5))
plt.spy(test_features.gt(0).values);

In [None]:
# Percentage of sampled test feature cells that are exactly zero.
test_features.eq(0).sum().sum() / test_features.size * 100

### Constant columns carry no information: 256 of them => can be removed right away

In [None]:
# Distinct-value count for each feature column.
nunique = features.nunique()
# A column with exactly one distinct value is constant across the training rows.
no_info = nunique.eq(1)
# How many such constant columns there are.
no_info.sum()

In [None]:
# Labels of the constant columns identified from the training features.
to_drop = nunique[no_info].index.values
# Rebind each frame instead of dropping in place, and tolerate labels that are
# already gone, so re-running this cell is a no-op rather than a KeyError.
train = train.drop(columns=to_drop, errors='ignore')
features = features.drop(columns=to_drop, errors='ignore')
test = test.drop(columns=to_drop, errors='ignore')
test_features = test_features.drop(columns=to_drop, errors='ignore')

### Duplicate rows: only two and with different targets

In [None]:
# Show ID and target for every training row whose feature values also occur in
# another row (keep=False flags all members of a duplicate group, not just the repeats).
train.loc[features.duplicated(keep=False), ['ID', 'target']]

### Duplicate columns: present in training, but not in the test sample

In [None]:
# Transpose so that each feature column becomes a row and duplicated() compares columns.
trans = features.T
# Every column that has at least one identical twin.
all_duplicates = trans.index[trans.duplicated(keep=False).values]
# Only the repeats: the first column of each identical group is not flagged.
last_duplicates = trans.index[trans.duplicated().values]
(all_duplicates, last_duplicates)

In [None]:
# test_features is already a random sample with as many rows as the training
# set, so drawing another sample of that same size would only shuffle and copy
# it. Reuse it directly: comparing columns does not depend on row order.
test_sample = test_features
# Transpose so that each column becomes a row and duplicated() compares columns.
trans_test = test_sample.T
# Labels of all test-sample columns that have an identical twin.
trans_test[trans_test.duplicated(keep=False)].index

In [None]:
# For each unordered pair of columns duplicated in training, print how many rows
# differ between the two columns in train and in the test sample.
for i, col1 in enumerate(all_duplicates):
    for col2 in all_duplicates[i + 1:]:
        # Series.sum() counts the mismatches in vectorized code; the builtin
        # sum() would walk the Series element by element in Python.
        n_diff_train = (train[col1] != train[col2]).sum()
        n_diff_test = (test_sample[col1] != test_sample[col2]).sum()
        print(col1, col2, 'train:', n_diff_train, ' test:', n_diff_test)

## Target

### No zeros: starts from 30,000.

In [None]:
# Summary statistics (count, mean, std, quantiles, extremes) of the target column.
train['target'].describe()

### Approximately linear on a log scale

In [None]:
# Sorted target values plotted against their rank, with a logarithmic y axis.
fig, ax = plt.subplots()
ax.scatter(np.arange(train.shape[0]), np.sort(train['target'].values));
ax.set_yscale('log')

## Integer columns

In [None]:
# Labels of the feature columns whose dtype is int64.
int_cols = features.dtypes[features.dtypes == np.int64].index.values
# Sub-frame restricted to those integer columns.
int_train = features.loc[:, int_cols]

### Very sparse data

In [None]:
# Sparsity pattern of the integer columns: a mark for each strictly positive entry.
plt.figure(figsize=(5, 10))
plt.spy(int_train.gt(0).values);

In [None]:
# Percentage of cells in the integer columns that are exactly zero.
int_train.eq(0).sum().sum() / int_train.size * 100

### Number of unique values per column (many columns could be categorical)

In [None]:
# Distinct-value count for each integer column.
nunique_int = int_train.nunique()
# Histogram of those counts, drawn explicitly on `ax`, with a logarithmic x axis.
fig, ax = plt.subplots()
nunique_int.hist(ax=ax, bins=300, bottom=0.1)
ax.set_xscale('log')

## Float columns

In [None]:
# Labels of the feature columns whose dtype is float64.
float_cols = features.dtypes[features.dtypes == np.float64].index.values
# Sub-frame restricted to those float columns.
float_train = features.loc[:, float_cols]

### Slightly less sparse than int

In [None]:
# float_train is already defined as features[float_cols]; re-selecting the same
# columns from `train` was a redundant rebind to identical data (float_cols
# holds feature columns only), so the existing frame is used as is.
# Sparsity pattern of the float columns: a mark for each strictly positive entry.
plt.figure(figsize=(5,10))
plt.spy((float_train > 0).values);

In [None]:
# Percentage of cells in the float columns that are exactly zero.
float_train.eq(0).sum().sum() / float_train.size * 100

### The distribution of unique counts is notably different between floats and ints

In [None]:
# Distinct-value count for each float column.
nunique_float = float_train.nunique()
# Histogram of those counts, drawn explicitly on `ax`, with a logarithmic x axis.
fig, ax = plt.subplots()
nunique_float.hist(ax=ax, bins=300, bottom=0.1)
ax.set_xscale('log')

# Save the data in the feather format

In [None]:
train.target = np.log1p(train.target)
%time train.to_feather('train.feather')
%time train = pd.read_feather('train.feather')

In [None]:
# Not enough disk space on Kaggle to save the test set as well; uncomment to run locally
# %time test.to_feather('test.feather')
# %time test = pd.read_feather('test.feather')