# Task 1: Get familiar with the data
# Import the necessary libraries
# Read nfl.csv into a pandas DataFrame and examine it

In [None]:
# Python libraries for exploring and manipulating data
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra / numerical arrays

# read the NFL play-by-play CSV into a DataFrame named `nfl`
nfl = pd.read_csv("../input/nflplaybyplay2009to2016/NFL Play by Play 2009-2017 (v4).csv")
# seed NumPy's global random generator for reproducibility
# (no random call is visible in this section of the notebook)
np.random.seed(0) 

# examine the first 5 rows of the nfl data
nfl.head()

In [None]:
# Examine the shape of the data set: prints the (rows, columns) tuple
print(nfl.shape)

In [None]:
# Examine the column names, non-null counts, dtypes and memory usage.
# info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
nfl.info()

In [None]:
# Examine the dtype of every column (one entry per column)
nfl.dtypes

In [None]:
# Summary statistics restricted to the integer-typed columns of nfl
nfl.describe(include='int')

In [None]:
# Summary statistics restricted to the float-typed columns of nfl
nfl.describe(include='float')


In [None]:
# Summary of the object-typed (e.g. string) columns of nfl;
# for object columns describe() reports count, unique, top and freq
nfl.describe(include='object')

# Task 2: Handling missing values
# How many missing data points does the dataset have?



In [None]:
# 'isnull' returns a DataFrame of booleans (True if missing, False if not missing);
# head() limits the display to the first 5 rows
nfl.isnull().head()

In [None]:
# 'isna' and 'isnull' are aliases of each other and give the same boolean mask
nfl.isna().head()

In [None]:
# 'notnull' returns the opposite of 'isnull' (True if not missing, False if missing)
nfl.notnull().head()

In [None]:
# 'notna' and 'notnull' are aliases of each other
nfl.notna().head()

In [None]:
# How many total missing values do we have?
# Total number of cells = rows * columns. np.prod is used because the
# np.product alias was deprecated in NumPy 1.25 and removed in NumPy 2.0.
total_cells = np.prod(nfl.shape)
# Number of missing data points per column
nfl_missing_values_sum_per_col = nfl.isnull().sum()
# Total number of missing values in the whole data set
total_missing = nfl_missing_values_sum_per_col.sum()
# A bare expression in the middle of a cell is not displayed (only the last
# expression is), so the total is printed explicitly.
print(total_missing)
# Percent of the data set's cells that are missing
(total_missing / total_cells) * 100

In [None]:
# Rank the columns by how many values they are missing (worst first)
nfl_missing_values_sum_per_col = (
    nfl.isnull()
    .sum()
    .sort_values(ascending=False)
)
print("Sum of missing values/column in nfl dataset", nfl_missing_values_sum_per_col.head())

print('_' * 40)
# isnull() yields booleans only, so count() on it is the row count per column
nfl_values_count_per_col = nfl.isnull().count()
print("Total number(count) of values in nfl dataset", nfl_values_count_per_col.head())


# Share of missing values per column, in percent
nfl_missing_values_percentage_per_col = (
    (nfl_missing_values_sum_per_col / nfl_values_count_per_col) * 100
)
print('_' * 40)

# Table with the absolute ('Total') and relative ('Percent') missing counts;
# display the 10 columns with the highest percentage of missing values
missing_data = pd.concat(
    {
        'Total': nfl_missing_values_sum_per_col,
        'Percent': nfl_missing_values_percentage_per_col,
    },
    axis=1,
)
missing_data.sort_values(by="Percent", ascending=False).head(10)

In [None]:
# First drop every column whose total number of missing values exceeds 100.
# The labels are passed via `columns=`: giving the axis positionally
# (drop(labels, 1)) was deprecated in pandas 1.3 and raises TypeError
# from pandas 2.0 onwards.
nfl_data = nfl.drop(columns=missing_data[missing_data['Total'] > 100].index)
nfl_data.head()

In [None]:
# Examine the shape (rows, columns) of the reduced data set
nfl_data.shape

In [None]:
# Column names, non-null counts and dtypes of the reduced data set
nfl_data.info()

In [None]:
# Largest per-column count of missing values in nfl_data.
# Note: this is the maximum over columns, not the grand total of missing cells.
nfl_data.isnull().sum().max()

In [None]:
# 'value_counts' does not include missing values by default;
# show the 10 most frequent values of the 'desc' column
nfl_data['desc'].value_counts().head(10)

In [None]:
# explicitly include missing values (NaN gets its own entry in the counts)
nfl_data['desc'].value_counts(dropna=False).head()

In [1]:
# Select the entries of the 'desc' column that are missing (NaN)
nfl_data.desc.loc[nfl_data.desc.isnull()]

In [None]:
# Alternative (left disabled): fill in missing values with a specified value
#nfl_data['desc'].fillna(value='VARIOUS', inplace=True)

In [None]:
# Now remove all columns which have any missing values.
# .head() must not be chained onto the assignment: that would keep only the
# first 5 rows in nfl_data and every later step (including the saved CSV)
# would work on a 5-row frame. Preview the result separately instead.
nfl_data = nfl_data.dropna(how='any', axis=1)
nfl_data.head()

In [None]:
# Persist the tidied frame so the cleaning steps need not be repeated
# every time the data set is used, then load it back from disk.
nfl_data.to_csv(path_or_buf='nfl_data_new.csv', index=False)
nfl_data_new = pd.read_csv(filepath_or_buffer='nfl_data_new.csv')
# Preview the first 5 rows of the reloaded file
nfl_data_new.head(5)

In [None]:
# Examine the shape (rows, columns) of the reloaded data set
nfl_data_new.shape

In [None]:
# Column names, non-null counts and dtypes of the reloaded data set
nfl_data_new.info()

In [None]:
# Largest per-column count of missing values in the reloaded data set
# (maximum over columns, not the grand total of missing cells)
nfl_data_new.isnull().sum().max()