## CH-04: Beginning Data Analysis 

In [1]:
import pandas as pd
import numpy as np
# Keep rendered frames compact: at most 6 columns, 6 rows and 12 characters per cell.
pd.set_option('display.max_columns', 6)
pd.set_option('display.max_rows', 6)
pd.set_option('display.max_colwidth', 12)

## Introduction

## Developing a data analysis routine

### How to do it...

In [2]:
# Load the college dataset; the path is relative to the notebook's folder.
college = pd.read_csv('../data/college.csv')
# Only the last expression of a cell is rendered, so the preview is a single
# call. (A bare `college.sample(random_state=42)` placed before it would be
# computed and silently discarded.)
college.head()


Unnamed: 0,INSTNM,CITY,STABBR,...,UG25ABV,MD_EARN_WNE_P10,GRAD_DEBT_MDN_SUPP
0,Alabama ...,Normal,AL,...,0.1049,30300,33888.0
1,Universi...,Birmingham,AL,...,0.2422,39700,21941.5
2,Amridge ...,Montgomery,AL,...,0.854,40100,23370.0
3,Universi...,Huntsville,AL,...,0.264,45500,24097.0
4,Alabama ...,Montgomery,AL,...,0.127,26600,33118.5


In [3]:
# (number of rows, number of columns) of the loaded frame.
college.shape

(7535, 27)

In [4]:
# Print the index range plus, for every column, its non-null count and dtype.
college.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7535 entries, 0 to 7534
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   INSTNM              7535 non-null   object 
 1   CITY                7535 non-null   object 
 2   STABBR              7535 non-null   object 
 3   HBCU                7164 non-null   float64
 4   MENONLY             7164 non-null   float64
 5   WOMENONLY           7164 non-null   float64
 6   RELAFFIL            7535 non-null   int64  
 7   SATVRMID            1185 non-null   float64
 8   SATMTMID            1196 non-null   float64
 9   DISTANCEONLY        7164 non-null   float64
 10  UGDS                6874 non-null   float64
 11  UGDS_WHITE          6874 non-null   float64
 12  UGDS_BLACK          6874 non-null   float64
 13  UGDS_HISP           6874 non-null   float64
 14  UGDS_ASIAN          6874 non-null   float64
 15  UGDS_AIAN           6874 non-null   float64
 16  UGDS_N

In [15]:
# Summary statistics for the numeric columns only, transposed so that each
# column of `college` becomes one row of the summary.
numeric_summary = college.describe(include=[np.number])
numeric_summary.transpose()

Unnamed: 0,count,mean,std,...,50%,75%,max
HBCU,7164.0,0.014238,0.118478,...,0.00000,0.000000,1.0
MENONLY,7164.0,0.009213,0.095546,...,0.00000,0.000000,1.0
WOMENONLY,7164.0,0.005304,0.072642,...,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...
PCTPELL,6849.0,0.530643,0.225544,...,0.52150,0.712900,1.0
PCTFLOAN,6849.0,0.522211,0.283616,...,0.58330,0.745000,1.0
UG25ABV,6718.0,0.410021,0.228939,...,0.40075,0.572275,1.0


In [16]:
# Summarise the non-numeric columns (count / unique / top / freq).
# `pd.Categorical` is an array class, not a dtype selector; the documented way
# to ask for categorical columns is the string 'category'. Listing `object`
# alongside it keeps the string columns in the summary.
college.describe(include=[object, 'category']).T

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama ...,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacyS...,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacyS...,1510


In [17]:
# Count / unique / top / freq for the object-dtype columns, one row per column.
college.describe(include=[object]).transpose()

Unnamed: 0,count,unique,top,freq
INSTNM,7535,7535,Alabama ...,1
CITY,7535,2514,New York,87
STABBR,7535,59,CA,773
MD_EARN_WNE_P10,6413,598,PrivacyS...,822
GRAD_DEBT_MDN_SUPP,7503,2038,PrivacyS...,1510


### How it works...

### There's more...

In [12]:
# Numeric summary again, this time asking for a finer set of percentiles
# (1st through 99th) instead of the default quartiles.
extra_percentiles = [.01, .05, .10, .25, .5, .75, .9, .95, .99]
college.describe(include=[np.number], percentiles=extra_percentiles).transpose()

Unnamed: 0,count,mean,std,...,95%,99%,max
HBCU,7164.0,0.014238,0.118478,...,0.00000,1.000000,1.0
MENONLY,7164.0,0.009213,0.095546,...,0.00000,0.000000,1.0
WOMENONLY,7164.0,0.005304,0.072642,...,0.00000,0.000000,1.0
...,...,...,...,...,...,...,...
PCTPELL,6849.0,0.530643,0.225544,...,0.89636,0.993908,1.0
PCTFLOAN,6849.0,0.522211,0.283616,...,0.89792,0.986368,1.0
UG25ABV,6718.0,0.410021,0.228939,...,0.80000,0.917383,1.0


## Data dictionaries

In [14]:
# Load the data dictionary (one row per column name with its description) and show it.
data_dictionary = pd.read_csv('../data/college_data_dictionary.csv')
data_dictionary

Unnamed: 0,column_name,description
0,INSTNM,Institut...
1,CITY,City Loc...
2,STABBR,State Ab...
...,...,...
24,UG25ABV,Percent ...
25,MD_EARN_...,Median E...
26,GRAD_DEB...,Median d...


## Reducing memory by changing data types

### How to do it...

In [None]:
# Reload the college data. The file lives in '../data', the same folder every
# other read in this notebook uses.
college = pd.read_csv('../data/college.csv')
# Columns whose memory footprint is compared in the following cells.
different_cols = ['RELAFFIL', 'SATMTMID', 'CURROPER',
   'INSTNM', 'STABBR']
col2 = college.loc[:, different_cols]
col2.head()

In [None]:
# dtype of each selected column before any conversion.
col2.dtypes

In [None]:
# Per-column memory in bytes; deep=True also measures the Python objects
# stored in object columns. Kept as the baseline for the ratio computed later.
original_mem = col2.memory_usage(deep=True)
original_mem

In [None]:
# Downcast RELAFFIL to 8-bit integers.
col2['RELAFFIL'] = col2['RELAFFIL'].astype('int8')

In [None]:
# Confirm the dtype change made to RELAFFIL.
col2.dtypes

In [None]:
# Memory of the same columns taken from the `college` frame, for comparison.
college.loc[:, different_cols].memory_usage(deep=True)

In [None]:
# Number of distinct values in each object column; a low count marks a good
# candidate for the category dtype.
object_cols = col2.select_dtypes(include=['object'])
object_cols.nunique()

In [None]:
# Convert STABBR to the category dtype, then list the dtypes to confirm.
col2['STABBR'] = col2['STABBR'].astype('category')
col2.dtypes

In [None]:
# Per-column memory in bytes after the dtype conversions above.
new_mem = col2.memory_usage(deep=True)
new_mem

In [None]:
# Fraction of the original memory each column now occupies.
new_mem.div(original_mem)

### How it works...

### There's more...

In [None]:
# Change one value in each of the two columns (a much larger number, a string
# one character longer), then measure the memory of both columns.
college.loc[0, 'CURROPER'] = 10_000_000
college.loc[0, 'INSTNM'] += 'a'
college[['CURROPER', 'INSTNM']].memory_usage(deep=True)

In [None]:
# dtype currently used to store MENONLY.
college['MENONLY'].dtype

In [None]:
# MENONLY has missing values (7164 non-null out of 7535 rows) and NaN has no
# integer representation, so this cast raises. Catch the error and show its
# message instead of a traceback, so the demonstration does not stop a
# Restart & Run All.
try:
    menonly_int8 = college['MENONLY'].astype(np.int8)
except ValueError as err:
    menonly_int8 = f'{type(err).__name__}: {err}'
menonly_int8

In [None]:
# Return a copy of `college` with MENONLY stored as float16 and RELAFFIL as
# int8; `college` itself is left unchanged.
college.astype({'MENONLY': 'float16', 'RELAFFIL': 'int8'})

In [None]:
# Replace the index with one that stores every label explicitly as an integer,
# then measure it. `pd.Int64Index` was removed in pandas 2.0; building a plain
# `pd.Index` from the materialised labels works on both old and new versions.
college.index = pd.Index(college.index.to_numpy())
college.index.memory_usage() # previously was just 80

## Selecting the smallest of the largest

### How to do it...

In [18]:
# Load the movie data and keep only the title, score and budget columns.
movie = pd.read_csv('../data/movie.csv')
score_budget_cols = ['movie_title', 'imdb_score', 'budget']
movie2 = movie[score_budget_cols]
movie2.head()

Unnamed: 0,movie_title,imdb_score,budget
0,Avatar,7.9,237000000.0
1,Pirates ...,7.1,300000000.0
2,Spectre,6.8,245000000.0
3,The Dark...,8.5,250000000.0
4,Star War...,7.1,


In [20]:
# Select the 100 movies with the highest imdb_score via .nlargest and preview
# the first five of them.
movie2.nlargest(n=100, columns='imdb_score').head()

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering...,9.5,
1920,The Shaw...,9.3,25000000.0
3402,The Godf...,9.2,6000000.0
2779,Dekalog,9.1,
4312,Kickboxe...,9.1,17000000.0


In [21]:
# Among the 100 highest-scoring movies, return the five with the lowest budget.
top_100_by_score = movie2.nlargest(100, 'imdb_score')
top_100_by_score.nsmallest(5, 'budget')

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfl...,8.7,180000.0
4801,Children...,8.5,180000.0
4706,12 Angry...,8.9,350000.0
4550,A Separa...,8.4,500000.0
4636,The Othe...,8.4,500000.0


### How it works...

### There's more...

## Selecting the largest of each group by sorting

### How to do it...

In [22]:
# Reload the movie data and show the three columns used in this recipe.
movie = pd.read_csv('../data/movie.csv')
movie.loc[:, ['movie_title', 'title_year', 'imdb_score']]

Unnamed: 0,movie_title,title_year,imdb_score
0,Avatar,2009.0,7.9
1,Pirates ...,2007.0,7.1
2,Spectre,2015.0,6.8
...,...,...,...
4913,A Plague...,2013.0,6.3
4914,Shanghai...,2012.0,6.3
4915,My Date ...,2004.0,6.6


In [23]:
# Use the .sort_values method to sort the DataFrame by title_year. The default behavior sorts from the smallest to the largest. 
# Pass ascending=False to invert this behavior:

(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .sort_values('title_year', ascending=False)
)

Unnamed: 0,movie_title,title_year,imdb_score
3884,The Veil,2016.0,4.7
2375,My Big F...,2016.0,6.1
2794,Miracles...,2016.0,6.8
...,...,...,...
4704,Revolution,,6.7
4752,Happy Va...,,8.5
4912,The Foll...,,7.5


In [29]:
# Sort on several columns by passing a list: newest year first, and within
# each year the highest imdb_score first.
(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .sort_values(by=['title_year', 'imdb_score'], ascending=[False, False])
)

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxe...,2016.0,9.1
4277,A Beginn...,2016.0,8.7
3798,Airlift,2016.0,8.5
...,...,...,...
2165,Meet the...,,3.5
3246,The Bold...,,3.5
2119,The Bach...,,2.9


In [30]:
# After sorting by year and score (both descending), the first row of each
# year is its highest-scoring movie; .drop_duplicates keeps exactly that row.
year_score_sorted = (
    movie[['movie_title', 'title_year', 'imdb_score']]
    .sort_values(by=['title_year', 'imdb_score'], ascending=False)
)
year_score_sorted.drop_duplicates(subset=['title_year'], keep='first')

Unnamed: 0,movie_title,title_year,imdb_score
4312,Kickboxe...,2016.0,9.1
3745,Running ...,2015.0,8.6
4369,Queen of...,2014.0,8.7
...,...,...,...
4833,Over the...,1920.0,4.8
4695,Intolera...,1916.0,8.0
2725,Towering...,,9.5


### How it works... Group by Operations

### There's more...

In [31]:
# Same question answered with groupby: within each title_year group, sort by
# imdb_score (highest first) and keep the top row, then order the combined
# result by year, newest first.
(movie
  [['movie_title', 'title_year', 'imdb_score']]
  .groupby('title_year', as_index=False)
  .apply(lambda df: df.sort_values('imdb_score',
         ascending=False).head(1))
  .sort_values('title_year', ascending=False)
)

Unnamed: 0,Unnamed: 1,movie_title,title_year,imdb_score
90,4312,Kickboxe...,2016.0,9.1
89,3745,Running ...,2015.0,8.6
88,4369,Queen of...,2014.0,8.7
...,...,...,...,...
2,4767,The Big ...,1925.0,8.3
1,4833,Over the...,1920.0,4.8
0,4695,Intolera...,1916.0,8.0


In [None]:
# One row per (title_year, content_rating) pair: sort years and ratings
# descending and budget ascending, then keep the first row of each pair,
# i.e. the lowest-budget film.
year_rating_budget_cols = ['movie_title', 'title_year',
                           'content_rating', 'budget']
(movie
   [year_rating_budget_cols]
   .sort_values(by=['title_year', 'content_rating', 'budget'],
                ascending=[False, False, True])
   .drop_duplicates(subset=['title_year', 'content_rating'])
)

## Replicating nlargest with sort_values

### How to do it...

In [32]:
# Baseline to replicate: the five lowest-budget films among the 100 with the
# highest imdb_score, computed with .nlargest followed by .nsmallest.
movie = pd.read_csv('../data/movie.csv')
top_scored = movie[['movie_title', 'imdb_score', 'budget']].nlargest(100, 'imdb_score')
top_scored.nsmallest(5, 'budget')

Unnamed: 0,movie_title,imdb_score,budget
4804,Butterfl...,8.7,180000.0
4801,Children...,8.5,180000.0
4706,12 Angry...,8.9,350000.0
4550,A Separa...,8.4,500000.0
4636,The Othe...,8.4,500000.0


In [33]:
# Replicate the .nlargest step: sort by imdb_score, highest first, and take
# the first 100 rows with .head.
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .sort_values(by='imdb_score', ascending=False)
   .head(n=100)
)

Unnamed: 0,movie_title,imdb_score,budget
2725,Towering...,9.5,
1920,The Shaw...,9.3,25000000.0
3402,The Godf...,9.2,6000000.0
...,...,...,...
3935,Batman: ...,8.4,3500000.0
4636,The Othe...,8.4,500000.0
2455,Aliens,8.4,18500000.0


In [34]:
# From the 100 highest-scoring movies, sort by budget (lowest first) and take
# the first five rows.
top_100_sorted = (
    movie[['movie_title', 'imdb_score', 'budget']]
    .sort_values(by='imdb_score', ascending=False)
    .head(n=100)
)
top_100_sorted.sort_values(by='budget').head(n=5)

Unnamed: 0,movie_title,imdb_score,budget
4815,A Charli...,8.4,150000.0
4801,Children...,8.5,180000.0
4804,Butterfl...,8.7,180000.0
4706,12 Angry...,8.9,350000.0
4636,The Othe...,8.4,500000.0


### How it works...

In [35]:
# Last five rows of the 100 selected by .nlargest.
(movie
   [['movie_title', 'imdb_score', 'budget']]
   .nlargest(n=100, columns='imdb_score')
   .tail(n=5)
)

Unnamed: 0,movie_title,imdb_score,budget
4023,Oldboy,8.4,3000000.0
4163,To Kill ...,8.4,2000000.0
4395,Reservoi...,8.4,1200000.0
4550,A Separa...,8.4,500000.0
4636,The Othe...,8.4,500000.0


In [36]:
# Last five rows of the 100 selected by .sort_values + .head, for comparison
# with the .nlargest result.
first_100_sorted = (
    movie[['movie_title', 'imdb_score', 'budget']]
    .sort_values(by='imdb_score', ascending=False)
    .head(n=100)
)
first_100_sorted.tail(n=5)

Unnamed: 0,movie_title,imdb_score,budget
3799,Anne of ...,8.4,
3777,Requiem ...,8.4,4500000.0
3935,Batman: ...,8.4,3500000.0
4636,The Othe...,8.4,500000.0
2455,Aliens,8.4,18500000.0


## Calculating a trailing stop order price

### How to do it...

In [38]:
import datetime
# `web` must be importable here: the download cell that follows calls
# web.DataReader, which raises NameError while this import is commented out.
import pandas_datareader.data as web
import requests_cache
# HTTP session backed by a local SQLite cache named 'cache'; cached responses
# expire after 90 days.
session = requests_cache.CachedSession(
   cache_name='cache', backend='sqlite',
   expire_after=datetime.timedelta(days=90))

ModuleNotFoundError: No module named 'requests_cache'

In [None]:
# Fetch 'tsla' price data from the 'yahoo' source, starting 2017-1-1, through
# the cached session, and preview the first eight rows.
tsla = web.DataReader(
    'tsla',
    data_source='yahoo',
    start='2017-1-1',
    session=session,
)
tsla.head(n=8)

In [None]:
# Keep just the 'Close' column as a Series.
tsla_close = tsla.loc[:, 'Close']

In [None]:
# Running maximum of the closing price: each entry is the highest close seen
# up to and including that row.
tsla_cummax = tsla_close.cummax()
tsla_cummax.head()

In [None]:
# Trailing stop price: 90% of the running maximum closing price.
trailing_stop = tsla['Close'].cummax().mul(.9)
trailing_stop.head()

### How it works...

### There's more...