### Notebook for preprocessing and cleaning data

The following steps are performed on the data:
1. Find and process any null columns or rows.
2. Find and process any duplicate columns or rows.


In [30]:
## Imports
import pandas as pd

In [31]:
# Read the training split and the test split from the shared input directory.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

# Report each split's (rows, columns) on its own line, then preview the
# first few training rows as the cell's rich output.
print(f"Train {train.shape}", f"Test {test.shape}", sep="\n")
train.head()

Train (10886, 12)
Test (6493, 9)


Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


### Convert datetime and add separate features for year, month, day and hour

In [32]:
## Convert the datetime column into separate calendar features
def _add_datetime_features(frame):
    """Return a copy of ``frame`` with ``datetime`` expanded into calendar parts.

    The ``datetime`` column is parsed with ``pd.to_datetime`` and replaced by
    ``year``, ``month``, ``day`` (day of the month) and ``hour`` columns,
    appended in that order. The frame passed in is not modified.
    """
    expanded = frame.copy()
    stamps = pd.to_datetime(expanded['datetime'])
    # Assign one column at a time so the new columns keep a fixed order.
    for part in ('year', 'month', 'day', 'hour'):
        expanded[part] = getattr(stamps.dt, part)
    return expanded.drop(columns='datetime')


# Train: expand datetime, then add each rental type's share of the total.
train = _add_datetime_features(train)
# NOTE: a row with count == 0 would yield NaN/inf here rather than an error.
train['registered_percent'] = (train['registered'] / train['count']) * 100  # registered rentals, % of count
train['casual_percent'] = (train['casual'] / train['count']) * 100  # casual rentals, % of count
print(f"Train {train.shape}")

# Test: store the raw datetime values first; they are used for submissions.
test_datetime = test['datetime'].copy()
test = _add_datetime_features(test)
print(f"Test {test.shape}")
train.head()

Train (10886, 17)
Test (6493, 12)


Unnamed: 0,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,year,month,day,hour,registered_percent,casual_percent
0,1,0,0,1,9.84,14.395,81,0.0,3,13,16,2011,1,1,0,81.25,18.75
1,1,0,0,1,9.02,13.635,80,0.0,8,32,40,2011,1,1,1,80.0,20.0
2,1,0,0,1,9.02,13.635,80,0.0,5,27,32,2011,1,1,2,84.375,15.625
3,1,0,0,1,9.84,14.395,75,0.0,3,10,13,2011,1,1,3,76.923077,23.076923
4,1,0,0,1,9.84,14.395,75,0.0,0,1,1,2011,1,1,4,100.0,0.0


### NaN values

In [34]:
# Reduce the per-cell null mask to a single flag: first per column, then
# across columns. True means at least one missing value exists in the frame.
train_has_nan = train.isnull().any().any()
print(f"Does train data have any row have NaN values: {train_has_nan}")

test_has_nan = test.isnull().any().any()
print(f"Does test data have any row have NaN values: {test_has_nan}")

Does train data have any row have NaN values: False
Does test data have any row have NaN values: False


### Duplicate rows or columns

In [40]:
# Duplicate rows: DataFrame.duplicated() marks every row that repeats an
# earlier row, so .any() is True when at least one repeated row exists.
print(f"Train duplicate rows: {train.duplicated().any()}")
# Check the test frame itself (previously this line re-checked train).
print(f"Test duplicate rows: {test.duplicated().any()}")
print()
# Duplicate columns: transpose so columns become rows, then reuse the same
# row-wise check. This compares column values, not column names.
print(f"Train duplicate columns: {train.T.duplicated().any()}")
# Check the test frame itself (previously this line re-checked train).
print(f"Test duplicate columns: {test.T.duplicated().any()}")

Train duplicate rows: False
Test duplicate rows: False

Train duplicate columns: False
Test duplicate columns: False
