## CH-04: Beginning Data Analysis 

In [1]:
import pandas as pd
import numpy as np
# Keep rendered frames compact: at most 6 columns, 6 rows, and 12 characters per cell.
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_colwidth', 12)

## Introduction

## Developing a data analysis routine

In [None]:
# Load the college dataset; the remaining cells in this section inspect it.
college = pd.read_csv('../data/college.csv')

# Jupyter renders only a cell's final expression, so the sampled row must be
# displayed explicitly or it is silently discarded. `display` is supplied by
# the IPython kernel; random_state=42 keeps the sampled row reproducible.
display(college.sample(random_state=42))
college.head()

In [None]:
# Get the dimensions of the DataFrame as a (rows, columns) tuple with the
# .shape attribute:

college.shape

In [None]:
# Print each column's data type and non-missing count, plus the frame's memory
# usage, with the .info method:

college.info()

In [None]:
# Summary statistics with .describe's default column selection:
college.describe()

In [None]:
# Summarize only the numeric columns, then flip the table so that each column
# of `college` becomes a row, which is easier to read:

numeric_summary = college.describe(include=[np.number])
numeric_summary.transpose()

In [None]:
# Summarize the non-numeric columns. pd.Categorical is the categorical array
# class rather than a dtype selector; pandas documents the string 'category'
# for selecting categorical columns. `object` is listed as well so the call
# still has columns to describe while the frame holds no categorical columns.
college.describe(include=[object, 'category']).T

In [None]:
# Summarize the object-dtype columns, one row per column:
object_summary = college.describe(include=[object])
object_summary.transpose()

In [None]:
# .describe accepts the exact quantiles to report for numeric columns
# through its percentiles parameter:

quantile_levels = [.01, .05, .10, .25, .5, .75, .9, .95, .99]
college.describe(include=[np.number], percentiles=quantile_levels).T

## Data dictionaries

In [None]:
# A crucial part of data analysis involves creating and maintaining a data dictionary. 

# A data dictionary is a table of metadata and notes on each column of data. One of the 
# primary purposes of a data dictionary is to explain the meaning of the column names. 

# The college dataset uses a lot of abbreviations that are likely to be unfamiliar to an 
# analyst who is inspecting it for the first time.


In [None]:
# The data dictionary for the college dataset ships alongside the data as
# college_data_dictionary.csv:

data_dictionary_path = '../data/college_data_dictionary.csv'
pd.read_csv(data_dictionary_path)

## Reducing memory by changing data types

In [None]:
# After reading in our college dataset, we select a few columns of different data types
# that will clearly show how much memory may be saved:

In [None]:
# Start from a fresh copy of the data and name the columns used for the
# memory comparison below.
college = pd.read_csv('../data/college.csv')
different_cols = [
    'RELAFFIL',
    'SATMTMID',
    'CURROPER',
    'INSTNM',
    'STABBR',
]

In [None]:
# Build the working frame: every row of `college` (the `:` row selector),
# restricted to the columns listed in different_cols.

# different_cols must hold column labels that exist in `college`.

col2 = college.loc[:, different_cols]
col2.head()

In [None]:
# Inspect the data type of each selected column:

col2.dtypes

In [None]:
# Record the per-column memory usage before any dtype changes; deep=True asks
# .memory_usage to inspect object columns rather than estimate them.
# original_mem is the baseline for the comparison at the end of this section.

original_mem = col2.memory_usage(deep=True)
original_mem

In [None]:
# RELAFFIL holds only 0 or 1, so 64 bits per value is wasteful.
# Downcast the column to an 8-bit (1 byte) integer with .astype:

relaffil_small = col2['RELAFFIL'].astype('int8')
col2['RELAFFIL'] = relaffil_small

In [None]:
# Confirm that RELAFFIL now reports the int8 dtype:
col2.dtypes

In [None]:
# Measure col2, the frame whose RELAFFIL column was just downcast. Selecting
# the columns from `college` again would report the untouched int64 data and
# hide the saving.
col2.memory_usage(deep=True)

In [None]:
# Object columns with a reasonably low cardinality (number of unique values)
# are candidates for the categorical dtype, which saves even more memory.
# Count the unique values in each object column first:

object_columns = col2.select_dtypes(include=['object'])
object_columns.nunique()

In [None]:
# STABBR has low cardinality (less than one percent of its values are unique),
# which makes it a good candidate for the category dtype:

col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

In [None]:
# Compute the per-column memory usage again, now that RELAFFIL and STABBR
# have been converted:

new_mem = col2.memory_usage(deep=True)
new_mem

In [None]:
# Compare the updated memory usage with the original, column by column.
# RELAFFIL is, as expected, an eighth of its original size, while STABBR has
# shrunk to just three percent of its original size:

new_mem.div(original_mem)

## Selecting the smallest of the largest

In [2]:
# This recipe can be used to create catchy news headlines such as "Out of the Top 100
# Universities, These 5 Have the Lowest Tuition" or "From the Top 50 Cities to Live,
# These 10 Are the Most Affordable."

In [3]:
# During analysis, it is possible that you will first need to find a grouping of data 
# that contains the top n values in a single column and, from this subset, find the 
# bottom m values based on a different column.

In [None]:
# In this recipe, we find the five lowest budget movies from the top 100 scoring 
# movies by taking advantage of the convenience methods: .nlargest and .nsmallest.

In [4]:
# Load the movie data and keep only the three columns this recipe needs.
movie = pd.read_csv('../data/movie.csv')
recipe_columns = ['movie_title', 'imdb_score', 'budget']
movie2 = movie[recipe_columns]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates ...,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark...,8.5,250000000.0
4,Star War...,7.1,


In [5]:
# Select the 100 movies with the highest imdb_score via .nlargest and preview
# the first few:

top_100_by_score = movie2.nlargest(100, 'imdb_score')
top_100_by_score.head()

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering...,9.5,
1920,The Shaw...,9.3,25000000.0
3402,The Godf...,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxe...,9.1,17000000.0


In [6]:
# From the 100 highest-scoring films, return the five with the lowest budget
# by following .nlargest with .nsmallest:

highest_scoring = movie2.nlargest(100, 'imdb_score')
highest_scoring.nsmallest(5, 'budget')

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfl...,8.7,180000.0
4801,Children...,8.5,180000.0
4706,12 Angry...,8.9,350000.0
4550,A Separa...,8.4,500000.0
4636,The Othe...,8.4,500000.0


In [None]:
# The first parameter of the .nlargest method, n, must be an integer and selects the 
# number of rows to be returned. The second parameter, columns, takes a column name 
# as a string.

# The .nlargest call returns the 100 highest-scoring movies. We could have saved this
# intermediate result as its own variable but instead, we chain the .nsmallest method
# to it, which returns exactly five rows, sorted by budget.

In [None]:
# It is possible to pass a list of column names to the columns parameter of the .nlargest
# and .nsmallest methods. This would only be useful to break ties in the event that there
# were duplicate values sharing the nth ranked spot in the first column in the list.

## Selecting the largest of each group by sorting

In [None]:
# One of the most basic and common operations to perform during data analysis is to select
# rows containing the largest value of some column within a group. For instance, this would be
# like finding the highest-rated film of each year or the highest-grossing film by content rating.

# To accomplish this task, we need to sort the groups as well as the column used to rank each
# member of the group, and then extract the highest member of each group.

# In this recipe, we will find the highest-rated film of each year.

In [7]:
# Reload the movie data and show the three columns used to rank films per year.
movie = pd.read_csv('../data/movie.csv')
year_score_columns = ['movie_title', 'title_year', 'imdb_score']
movie[year_score_columns]

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates ...,2007.0,7.1
2,Spectre,2015.0,6.8
...,...,...,...
4913,A Plague...,2013.0,6.3
4914,Shanghai...,2012.0,6.3
4915,My Date ...,2004.0,6.6


In [8]:
# Use the .sort_values method to sort the DataFrame by title_year.

# The default behavior sorts from the smallest to the largest, which is what
# ascending=True (spelled out below for clarity) requests.

# Pass ascending=False to invert this behavior.

(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .sort_values('title_year', ascending=True)
)

Unnamed: 0,movie_title,title_year,imdb_score
4695,Intolera...,1916.0,8.0
4833,Over the...,1920.0,4.8
4767,The Big ...,1925.0,8.3
...,...,...,...
4704,Revolution,,6.7
4752,Happy Va...,,8.5
4912,The Foll...,,7.5


In [None]:
# Notice how only the year was sorted. 

In [12]:
# Pass a list of column names to sort by several columns at once; the single
# ascending=False applies to every column in the list.

year_score = movie[['movie_title', 'title_year', 'imdb_score']]
year_score.sort_values(['title_year', 'imdb_score'], ascending=False)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxe...,2016.0,9.1
4277,A Beginn...,2016.0,8.7
3798,Airlift,2016.0,8.5
...,...,...,...
2165,Meet the...,,3.5
3246,The Bold...,,3.5
2119,The Bach...,,2.9


In [16]:
# After sorting by year and score (both descending), the first row of each
# year is its highest-rated film; .drop_duplicates keeps exactly that row.

year_score = movie[['movie_title', 'title_year', 'imdb_score']]
ranked = year_score.sort_values(['title_year', 'imdb_score'], ascending=False)
ranked.drop_duplicates(subset='title_year', keep='first')

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxe...,2016.0,9.1
3745,Running ...,2015.0,8.6
4369,Queen of...,2014.0,8.7
...,...,...,...
4833,Over the...,1920.0,4.8
4695,Intolera...,1916.0,8.0
2725,Towering...,,9.5


### Group by Operations

In [17]:
def top_rated(year_group):
    """Return the single highest-imdb_score row of one title_year group."""
    ranked = year_group.sort_values('imdb_score', ascending=False)
    return ranked.head(1)


# Same question answered with groupby: take the best row of every year, then
# list the years from newest to oldest.
by_year = (movie
  [['movie_title', 'title_year', 'imdb_score']]
  .groupby('title_year', as_index=False)
)
best_per_year = by_year.apply(top_rated)
best_per_year.sort_values('title_year', ascending=False)

Unnamed: 0,Unnamed: 1,movie_title,title_year,imdb_score
90,4312,Kickboxe...,2016.0,9.1
89,3745,Running ...,2015.0,8.6
88,4369,Queen of...,2014.0,8.7
...,...,...,...,...
2,4767,The Big ...,1925.0,8.3
1,4833,Over the...,1920.0,4.8
0,4695,Intolera...,1916.0,8.0


In [None]:
# It is possible to sort one column in ascending order while simultaneously
# sorting another column in descending order. 

# To accomplish this, pass in a list of Booleans to the ascending
# parameter that corresponds to how you would like each column sorted. 

# The following sorts title_year and content_rating in descending order 
# and budget in ascending order.

# It then finds the lowest budget film for each year and content rating group:

In [18]:
# Sort year and content rating descending and budget ascending, so the first
# row of each (title_year, content_rating) pair is its lowest-budget film.
group_keys = ['title_year', 'content_rating']
budget_view = movie[['movie_title', 'title_year', 'content_rating', 'budget']]
budget_sorted = budget_view.sort_values(
    group_keys + ['budget'],
    ascending=[False, False, True],
)
budget_sorted.drop_duplicates(subset=group_keys)

Unnamed: 0,movie_title,title_year,content_rating,budget
4026,Compadres,2016.0,R,3000000.0
4658,Fight to...,2016.0,PG-13,150000.0
4661,Rodeo Girl,2016.0,PG,500000.0
...,...,...,...,...
848,Stargate...,,TV-14,1400000.0
2436,Carlos,,Not Rated,
2119,The Bach...,,,3000000.0


## Replicating nlargest with sort_values

In [None]:
# The previous two recipes work similarly by sorting values in slightly 
# different manners. Finding the top n values of a column of data is equivalent 
# to sorting the entire column in descending order and taking the first n values. 

# pandas has many operations that are capable of doing this in a variety of ways.

# In this recipe, we will replicate the Selecting the smallest of the largest 
# recipe with the .sort_values method and explore the differences between the two.

In [22]:
# Reproduce the final result of the "Selecting the smallest of the largest"
# recipe as the reference to compare against:

movie = pd.read_csv('../data/movie.csv')
score_budget = movie[['movie_title', 'imdb_score', 'budget']]
top_scored = score_budget.nlargest(100, 'imdb_score')
top_scored.nsmallest(5, 'budget')

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfl...,8.7,180000.0
4801,Children...,8.5,180000.0
4706,12 Angry...,8.9,350000.0
4550,A Separa...,8.4,500000.0
4636,The Othe...,8.4,500000.0


In [23]:
# Replicate the first half of the expression: sort by imdb_score from high to
# low with .sort_values and keep the first 100 rows with .head

score_budget = movie[['movie_title', 'imdb_score', 'budget']]
by_score_desc = score_budget.sort_values('imdb_score', ascending=False)
by_score_desc.head(100)

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering...,9.5,
1920,The Shaw...,9.3,25000000.0
3402,The Godf...,9.2,6000000.0
...,...,...,...
3935,Batman: ...,8.4,3500000.0
4636,The Othe...,8.4,500000.0
2455,Aliens,8.4,18500000.0


In [24]:
# With the top 100 scoring movies in hand, apply .sort_values and .head a
# second time to pull out the five lowest budgets

score_budget = movie[['movie_title', 'imdb_score', 'budget']]
top_100_sorted = score_budget.sort_values('imdb_score', ascending=False).head(100)
top_100_sorted.sort_values('budget').head(5)

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charli...,8.4,150000.0
4801,Children...,8.5,180000.0
4804,Butterfl...,8.7,180000.0
4706,12 Angry...,8.9,350000.0
4636,The Othe...,8.4,500000.0


In [None]:
# To understand why the two results are not equivalent, let's look at the 
# tail of the intermediate steps of each recipe:

In [25]:
# Last rows of the 100-row intermediate result produced by .nlargest
score_budget = movie[['movie_title', 'imdb_score', 'budget']]
nlargest_top_100 = score_budget.nlargest(100, 'imdb_score')
nlargest_top_100.tail()

Unnamed: 0,movie_title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill ...,8.4,2000000.0
4395,Reservoi...,8.4,1200000.0
4550,A Separa...,8.4,500000.0
4636,The Othe...,8.4,500000.0


In [26]:
# Last rows of the 100-row intermediate result produced by .sort_values + .head
score_budget = movie[['movie_title', 'imdb_score', 'budget']]
sorted_top_100 = score_budget.sort_values('imdb_score', ascending=False).head(100)
sorted_top_100.tail()

Unnamed: 0,movie_title,imdb_score,budget
3799,Anne of ...,8.4,
3777,Requiem ...,8.4,4500000.0
3935,Batman: ...,8.4,3500000.0
4636,The Othe...,8.4,500000.0
2455,Aliens,8.4,18500000.0


In [None]:
# The issue arises because more than 100 movies exist with a rating of at least 8.4. 
# Each of the methods, .nlargest and .sort_values, breaks ties differently, which 
# results in a slightly different 100-row DataFrame. 

# If you pass in kind='mergesort' to the .sort_values method, you will get the same 
# result as .nlargest.

## Calculating a trailing stop order price

In [None]:
import datetime

# The next cell calls web.DataReader, so this import must be active; with it
# commented out that cell fails with a NameError on `web`.
import pandas_datareader.data as web
import requests_cache

# HTTP session that caches responses in a local SQLite cache named 'cache';
# cached entries expire after 90 days.
session = requests_cache.CachedSession(
   cache_name='cache', backend='sqlite',
   expire_after=datetime.timedelta(days=90))

In [None]:
# Fetch 'tsla' data from the 'yahoo' source starting at 2017-1-1, sending the
# requests through the cached `session`, and preview the first eight rows.
# NOTE(review): `web` must be bound to pandas_datareader.data (see the import
# cell above) before this cell runs.
tsla = web.DataReader('tsla', data_source='yahoo',
   start='2017-1-1', session=session)
tsla.head(8)

In [None]:
# Keep just the 'Close' column for the calculations that follow.
tsla_close = tsla['Close']

In [None]:
# Running maximum of the close: each entry is the highest close seen so far.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head()

In [None]:
# Trailing stop price: 90% of the running maximum of the closing price.
running_high = tsla['Close'].cummax()
stop_price = running_high * .9
stop_price.head()