# User integration with Jupyter
Ref: [StackOverflow: 31610889](https://stackoverflow.com/questions/31610889/how-to-copy-paste-dataframe-from-stackoverflow-into-python)

## Turn a Jupyter table output into a pandas DataFrame

In [None]:
# Ctrl+C the table output (say 0-3 records of the table above)
# Run this...
import pandas as pd

df_from_pd = pd.read_clipboard()
df_from_pd

## Convert Table from Excel via clipboard into a python dictionary array

In [None]:
# Copy the data of interest - with headers - into clipboard with Ctrl+C
# Run this...
import pandas as pd

## Copy the Excel table to the clipboard first.
the_dict = pd.read_clipboard().to_dict('records')
the_dict

In [None]:
# The clipboard from Excel has the following:
pd.read_clipboard()

# Filtering rows based on conditions from other columns
**Ref**: Data School Videos:
1. [How do I filter rows of a pandas DataFrame by column value?](https://www.youtube.com/watch?v=2AFGPdNn4FM)
2. [How do I apply multiple filter criteria to a pandas DataFrame?](https://www.youtube.com/watch?v=YPItfQ87qjM&t=5s)
3. [loc / iloc How do I select multiple rows and columns from a pandas DataFrame?](https://www.youtube.com/watch?v=xvpNA7bC8cs&t=488s)


In [None]:
import pandas as pd
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

# Single column condition
drinks[drinks["continent"] == 'Asia']

# Multiple conditions from a single column
drinks[drinks.continent.isin(['Asia', 'Africa'])]   #!!! BE MINDFUL OF TEXT WITH SPACES.

### Criteria from multiple columns
# Single conditions per column
drinks[(drinks["continent"] == 'Asia') & (drinks["beer_servings"] > 100)]

# Mix of single and multiple conditions per column
# Beer servings > 100 in Asia or Africa, but not in Vietnam
drinks[drinks.continent.isin(['Asia', 'Africa']) & (drinks.beer_servings > 100) & ~(drinks.country == 'Vietnam')]

In [None]:
# Multiple sub-string search
# Countries in Asia and Africa having the letter 's' or 'w' in them with beer_servings > 100
continent = ['Asia', 'Africa']
searchfor = ['s', 'w']
drinks[drinks.country.str.contains('|'.join(searchfor), case=False) & 
       drinks.continent.isin(continent) & 
       (drinks.beer_servings > 100)]

# Merging 2D x 1D arrays of different length

Ref: [Stackoverflow: 30598281](https://stackoverflow.com/questions/30597260/merging-a-dataframe-with-a-series/30598281)

The task is to repeat rows in the 2D array for values in the 1D array

In [None]:
## Merging a 2 dimensional master array with a one dimensional date series

import pandas as pd
import numpy as np
from datetime import date

# Create a 3 x 4 master dataframe
df = pd.DataFrame(np.random.randn(3,4), columns = list('ABCD'))
df

In [None]:
# Insert a key into the master dataframe
df['key'] = 0
df

In [None]:
# Create a 2 x 1 date series
dates = pd.date_range(date.today(), periods=2)
dates

In [None]:
# make the date series into a dataframe with the key 
ser = pd.DataFrame({'By': dates, 'key':[0] * len(dates)})
ser

In [None]:
# merge the master dataframe and the dataseries dataframe over the key and drop the key. 
result = pd.merge(df, ser, on = 'key').drop('key', axis = 1)
result

# Lookup if date is between two dates from another dataframe
The following lookup code evaluates if dates are between two dates and extracts the associated text (Weeknumber)

In [None]:
import pandas as pd, numpy as np
dates = pd.date_range('20180101', periods=21)
week_start = pd.date_range('20180101', periods=3, freq='W-Mon')
week_end = pd.date_range('20180101', periods=3, freq='W-Sun')
week = pd.Series(['W1', 'W2', 'W3'])
df1 = pd.DataFrame({'By': dates, 
                    'SerNo': np.random.randint(5, size=21)})
df2 = pd.DataFrame({'Start': week_start,
                    'End': week_end,
                    'Week': week})

In [None]:
# Text of the week
week

In [None]:
# Weekends
week_end

In [None]:
# Dataframe of dates (contains 21 values)
df1.loc[0:8,['SerNo', 'By']]

In [None]:
# Dataframe of weekly buckets
df2[['Start', 'End', 'Week']]

In [None]:
# Array with Interval index of the weeks
idx = pd.IntervalIndex.from_arrays(df2.Start, df2.End, closed='both')
idx

In [None]:
week = df2.loc[idx.get_indexer(df1.By), 'Week']
week[0:10]

In [None]:
df1['Week'] = week.values
df1.loc[0:10, ['SerNo', 'By', 'Week']]

# Lookup between two arrays and add records to master
Ref: [StackOverflow: 46597513](https://stackoverflow.com/questions/46597513/splitting-order-quantities-by-type-and-scoop)

In [None]:
import pandas as pd
import numpy as np # This is required for indexing to ignore. Find+Replace nan to np.nan

ask = [{'Date': '6-Oct-17', 'Qty': 80.0, 'Scoop': 'Single', 'Type': 'A'},
 {'Date': '10-Oct-17', 'Qty': 90.0, 'Scoop': 'Triple', 'Type': 'B'},
 {'Date': '9-Oct-17', 'Qty': 40.0, 'Scoop': 'Double', 'Type': 'D'},
 {'Date': '10-Oct-17', 'Qty': 20.0, 'Scoop': 'Double', 'Type': 'C'},
 {'Date': '10-Oct-17', 'Qty': 90.0, 'Scoop': 'Triple', 'Type': 'B'},
 {'Date': '9-Oct-17', 'Qty': 30.0, 'Scoop': 'Single', 'Type': 'A'}]

ask = pd.DataFrame(ask)
ask

In [None]:
icecream = [{'Flavour1': 'Strawberry',
  'Flavour2': np.nan,
  'Flavour3': np.nan,
  'Proportion': 0.25,
  'Scoop': 'Single',
  'Scoops/Tub': 4,
  'Type': 'A'},
 {'Flavour1': 'Banana',
  'Flavour2': 'Lemon',
  'Flavour3': np.nan,
  'Proportion': 0.25,
  'Scoop': 'Double',
  'Scoops/Tub': 2,
  'Type': 'C'},
 {'Flavour1': 'Vanilla',
  'Flavour2': 'Lemon',
  'Flavour3': 'Mint',
  'Proportion': 0.11,
  'Scoop': 'Triple',
  'Scoops/Tub': 3,
  'Type': 'B'},
 {'Flavour1': 'Chocolate',
  'Flavour2': 'Vanilla',
  'Flavour3': np.nan,
  'Proportion': 0.1,
  'Scoop': 'Double',
  'Scoops/Tub': 5,
  'Type': 'D'}]

icecream = pd.DataFrame(icecream)
icecream

In [None]:
# Attach the matching recipe row (flavours, proportion, scoops per tub) to every order by Type.
# The recipe's own 'Scoop' column is dropped first so only the order's 'Scoop' remains.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
tub = ask.merge(icecream.drop(columns='Scoop'), on='Type', how='left')
tub

In [None]:
tub=tub.set_index(['Date','Type','Scoop','Qty','Scoops/Tub','Proportion']).stack().reset_index()
tub

In [None]:
tub['Qty']=tub['Qty']*tub['Proportion']
tub

In [None]:
# Discard the helper columns, then give the stacked columns readable names:
# 'level_6' becomes 'Scoop' and the value column 0 becomes 'Flavour'.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
tub = (tub.drop(columns=['Scoops/Tub', 'Proportion', 'Scoop'])
          .rename(columns={'level_6': 'Scoop', 0: 'Flavour'}))
tub

# Check if a date is inside or outside a specified date range

In [None]:
# Check if a date is inside or outside a specified date range column
import pandas as pd
import numpy as np
df = pd.DataFrame({'A': pd.date_range('20170101', periods=10),
                    'B': pd.date_range('20170101', '20170310', freq="W-Fri"),
                    'C': pd.Timestamp('20170108')}); df
                
# df['Inside'] = np.where( (df['B'] > df['A']) & (df['B'] < df['C']), 'In' , 'Out'); df
df['Inside'] = np.where( (df['B'] > df['A']) & (df['B'] < df['C']), df['B'] - df['A'] , df['A'] - df['A']); df

# Build arrays for sample data

In [None]:
import numpy as np
np.empty((3,2))

In [None]:
np.full((2,2),7)

In [None]:
np.arange(10,25,5)

In [None]:
np.linspace(0,2,9)

In [None]:
np.arange(0,2,9)

In [None]:
np.identity(5)

In [None]:
np.random.random((5,5))*np.identity(5)

In [None]:
np.random.lognormal(mean=0, sigma=1, size=5)

# Group By

In [None]:
import pandas as pd
import numpy as np
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                           'foo', 'bar', 'foo', 'foo'],
                    'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                    'C' : np.random.randn(8),
                    'D' : np.random.randn(8)})

grouped = df.groupby(['A', 'B'])
grouped.last()

## Getting variable number of pandas rows w.r.t. a dictionary lookup

In [None]:
import pandas as pd
import numpy as np
import random, string

max_rows = {'A': 3, 'B': 2, 'D': 4} # max number of rows to be extracted

data_size = 1000

df = pd.DataFrame({'symbol': pd.Series(random.choice(string.ascii_uppercase) 
                                       for _ in range(data_size)),
              'qty': np.random.randn(data_size)}).sort_values('symbol')

In [None]:
pd.concat([df.loc[df["symbol"].eq(k)].head(v) for k,v in max_rows.items()])

# Extract Ticker symbols of S&P 500

In [None]:
import io

import pandas as pd
import urllib3

# Download the page with a urllib3 connection pool (the previous `https` name was never defined).
http = urllib3.PoolManager()
response = http.request('GET', 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')

# Parse the first table on the page, using its first row as the header.
# The raw bytes are wrapped in a file-like object for read_html.
symbols_table = pd.read_html(io.BytesIO(response.data), header=0)[0]

# NOTE(review): "Ticker symbol" must match the column header on the live page - confirm.
symbols = list(symbols_table.loc[:, "Ticker symbol"])
symbols_table.head()

# Classes
Best practice on classes and OOP in python (Ref: [jeffknupp.com](https://jeffknupp.com/blog/2014/06/18/improve-your-python-python-classes-and-object-oriented-programming/))

In [None]:
class Customer(object):
    """A customer of ABC Bank holding a checking account.

    Attributes:
       name: A string with the customer's name.
       balance: A float holding the account's current balance.

    """

    def __init__(self, name, balance=0.0):
        """Create a Customer called *name* whose opening balance is
        *balance* (0.0 when omitted)."""
        self.name, self.balance = name, balance

    def withdraw(self, amount):
        """Take *amount* dollars out of the account and return what is left.

        Raises RuntimeError when *amount* exceeds the current balance; the
        balance is left untouched in that case.
        """
        overdrawn = amount > self.balance
        if overdrawn:
            raise RuntimeError('Amount greater than available balance.')
        self.balance -= amount
        return self.balance

    def deposit(self, amount):
        """Add *amount* dollars to the account and return the new balance."""
        self.balance += amount
        return self.balance
    

In [None]:
# To call (instantiate) the class
jeff = Customer('Jeff Knupp', 1000.0)    # jeff is the object, which is an instance of the *Customer* class

In [None]:
# Calling a method on the instance passes that instance as the *self* parameter.
# For e.g. to withdraw

jeff.withdraw(100.0)   # Instruction to withdraw
jeff.balance           # Shows 900.0

In [None]:
# Another way to withdraw is by using the class name itself as follows:
Customer.withdraw(jeff, 200.0)
jeff.balance           # Shows 700.0

In [None]:
class Car(object):
    """A car described by its make and model.

    Usage::     Car(make, model)

    Attr:
       wheels: wheel count stored on the class itself (4), shared by every car
       make: a string naming the car company
       model: a string naming the model of the car

    """
    wheels = 4

    def __init__(self, make, model):
        """Build a Car whose company is *make* and whose model is *model*"""
        self.make, self.model = make, model


# Create one car and show its make.
mustang = Car("Ford", "Mustang")
mustang.make

In [None]:
mustang.model

In [None]:
mustang.wheels

In [None]:
Car.wheels

In [None]:
class Car(object):
    """A car described by its make and model, which can also make a sound.

    Usage::     Car(make, model)

    Attr:
       wheels: wheel count stored on the class itself (4), shared by every car
       make: a string naming the car company
       model: a string naming the model of the car

    """
    wheels = 4

    def __init__(self, make, model):
        """Build a Car whose company is *make* and whose model is *model*"""
        self.make, self.model = make, model

    @staticmethod
    def make_car_sound():
        """Print the engine noise; needs neither an instance nor the class."""
        sound = 'Vrooooooooom!'
        print(sound)


# Create one car and show its make.
mustang = Car("Ford", "Mustang")
mustang.make

In [None]:
Car.make_car_sound()

In [None]:
class Vehicle(object):
    """A vehicle with wheels, make and model.

    Usage:: Vehicle(wheels, miles, make, model, year, sold_on)

    Attr:
       wheels: integer number of wheels
       miles: integer number of miles driven
       make: string naming the manufacturer
       model: string naming the model
       year: integer year in which the vehicle was built
       sold_on: date on which the vehicle was sold, or None

    """

    def __init__(self, wheels, miles, make, model, year, sold_on):
        """Store every argument on the new Vehicle under the same name"""
        self.wheels, self.miles = wheels, miles
        self.make, self.model = make, model
        self.year, self.sold_on = year, sold_on

    def sale_price(self):
        """Return the sale price as a float: 0.0 once *sold_on* is set,
        otherwise 5000.0 per wheel"""
        already_sold = self.sold_on is not None
        return 0.0 if already_sold else 5000.0 * self.wheels

    def purchase_price(self):
        """Return the price we would pay for the vehicle: 0.0 while
        *sold_on* is None, otherwise 8000 less 10% of the miles"""
        not_yet_sold = self.sold_on is None
        return 0.0 if not_yet_sold else 8000 - (.10 * self.miles)
        

Derive *Car* and *Truck* from *Vehicle* (this is still not DRY code!). Also, a bare *Vehicle* should not be instantiable; only Cars and Trucks should be creatable.

In [None]:
class Car(Vehicle):
    """A Vehicle whose base sale price is 8000."""

    def __init__(self, wheels, miles, make, model, year, sold_on):
        """Return a new Car object"""
        # Every field is assigned here; Vehicle.__init__ is not called.
        self.wheels, self.miles = wheels, miles
        self.make, self.model = make, model
        self.year, self.sold_on = year, sold_on
        self.base_sale_price = 8000


class Truck(Vehicle):
    """A Vehicle whose base sale price is 10000."""

    def __init__(self, wheels, miles, make, model, year, sold_on):
        """Return a new Truck object"""
        # Same assignments as Car; only the base sale price differs.
        self.wheels, self.miles = wheels, miles
        self.make, self.model = make, model
        self.year, self.sold_on = year, sold_on
        self.base_sale_price = 10000

In [None]:
v = Vehicle(4, 0, 'Honda', 'Accord', 2014, None)

In [None]:
v.purchase_price()

# Abstract Base Class (ABC)
Use Abstract Base Class to abstract away some common data and behaviour.

In [None]:
from abc import ABCMeta, abstractmethod

class Vehicle(metaclass=ABCMeta):
    """A vehicle for sale by Kashi's Dealership

    Abstract base class: it cannot be instantiated directly. The metaclass is
    given in the class statement because Python 3 ignores a ``__metaclass__``
    attribute, which would leave ``@abstractmethod`` unenforced.

    Usage:: subclass Vehicle, set *base_sale_price* and *wheels*, and
            implement vehicle_type()

    Attr:
       base_sale_price: Starting price used by purchase_price - class level
       wheels: No of wheels of the vehicle - Integer, class level
       miles: No of miles driven on vehicle - Integer
       make: Manufacturer of the vehicle - String
       model: Model of the vehicle - String
       year: Year when the vehicle was built - Integer
       sold_on: Date when vehicle was sold - Date

    """

    # Defaults that each concrete subclass is expected to override.
    base_sale_price = 0
    wheels = 0

    def __init__(self, miles, make, model, year, sold_on):
        """ Returns a new Vehicle object"""
        self.miles = miles
        self.make = make
        self.model = model
        self.year = year
        self.sold_on = sold_on

    def sale_price(self):
        """Return the sale price for the vehicle - Float"""
        if self.sold_on is not None:
            return 0.0 # Already sold
        return 5000.0 * self.wheels

    def purchase_price(self):
        """Return the price we would pay to purchase the vehicle - Float"""
        if self.sold_on is None:
            return 0.0 # Not yet sold
        return self.base_sale_price - (0.10 * self.miles)

    @abstractmethod
    def vehicle_type(self):
        """Returns type of vehicle - String

        Subclasses must override this; a class that does not cannot be
        instantiated.
        """
        pass
        
        

Now the *Car* and *Truck* classes become:

In [None]:
class Car(Vehicle):
    """Car sold by Kashi's dealership: four wheels, base sale price 8000"""

    wheels = 4
    base_sale_price = 8000

    def vehicle_type(self):
        """Name this kind of vehicle - String"""
        kind = 'car'
        return kind


class Truck(Vehicle):
    """Truck sold by Kashi's dealership: four wheels, base sale price 10000"""

    wheels = 4
    base_sale_price = 10000

    def vehicle_type(self):
        """Name this kind of vehicle - String"""
        kind = 'truck'
        return kind
        

In [None]:
class Motorcycle(Vehicle):
    """Motorcycle sold by Kashi's dealership: two wheels, base sale price 4000"""

    wheels = 2
    base_sale_price = 4000

    def vehicle_type(self):
        """Name this kind of vehicle - String"""
        kind = 'motorcycle'
        return kind

In [None]:
mc = Motorcycle(make='Honda', miles=2000, model='Hawk', sold_on="01-Feb-2008",year=2007)
mc.vehicle_type()

# Profiling Python Code
[Easy Python Profiling](http://mortada.net/easily-profile-python-code-in-jupyter.html)


Profiling Python code can be done by:
1. %%time - for the whole code
2. %%timeit - for repeated execution of a single line or of an entire cell. The cell's own output is not displayed.
3. %load_ext line_profiler
+ %lprun -f function_name function_name(arguments)

# Printing lexed contents of a python file in Jupyter

Uses [Pygments](http://pygments.org/docs/quickstart/) Syntax highlighter
<p><b>Note:</b> This has been put as a function in _utilities.py_</p>

In [None]:
from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.formatters import HtmlFormatter
import IPython

def display_py(code):
    """Displays python file code in Jupyter

    Arg: (string from py file) code

    Output: an IPython.display.HTML object holding the Pygments style sheet
            followed by the highlighted code

    Usage: with open(myfile) as f:
                code = f.read()

           display_py(code)
    """
    # One formatter produces both the markup and the CSS, so the two always agree.
    formatter = HtmlFormatter()

    html_code = highlight(code, PythonLexer(), formatter)
    style_defs = formatter.get_style_defs('.highlight')
    styled_html = '<style type="text/css">{}</style>{}'.format(style_defs, html_code)

    return IPython.display.HTML(styled_html)
    
with open('add_two_numbers.py') as f:
    code = f.read()
    
display_py(code)

# List comprehensions

## Running ib_insync code in blocks

In [None]:
for i in range(0, len(options), 100):
    for t in ib.reqTickers(*options[i:i+100]):
        print(t)

...in list comprehension

In [None]:
[t for i in range(0, len(options), 100) for t in ib.reqTickers(*options[i:i+100])]

## Catching errors in list comprehension
Ref: [Stack Overflow: 1528237](https://stackoverflow.com/a/8915613/7978112)

In [None]:
def catch(func, handle=lambda e : e, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return its result.

    If the call raises an Exception, return ``handle(exception)`` instead.
    The default *handle* returns the exception object itself, which makes the
    function usable inside a list comprehension.
    """
    try:
        outcome = func(*args, **kwargs)
    except Exception as err:
        outcome = handle(err)
    return outcome

In [None]:
# In the list comprehension
eggs = (1,3,0,3,2)
[catch(lambda : 1/egg) for egg in eggs]

## Make dataframes from a nested list
Also used for making a dataframe from 3 lists of unequal lengths

In [None]:
nested_list = [('R1',
  {'a', 'b', 'c'},
  {20.0,   40.0,   50.0,   60.0,   750.0}),
 ('R2',
  {'x', 'y', 'z'},
  {35.0,   37.5,   165.0}), 
 ('R3',
  {'x', 'a', 'm'},
  {2.5,   5.0,   7.5,   10.0,   12.5,   45.0})]

nested_list

In [None]:
from  itertools import product

L = [[[x[0]], sorted(x[1]), sorted(x[2])] for x in nested_list]
pd.DataFrame([j for i in L for j in product(*i)], columns=['Cat','Column','Value']).head()

## Converting lambda into list comprehension

In [None]:
import pandas as pd
df = pd.DataFrame({'a': 1, 'b': range(4)})
df

In [None]:
def sumthis(a, b):
    """Return ``a + b``."""
    total = a + b
    return total

list(map(lambda x, y: sumthis(x, y), [i for i in df.a], [j for j in df.b]))

In [None]:
# In a list comprehension, zip is used to walk both columns in lockstep:
[sumthis(x, y) for x, y in zip(df.a, df.b)]

# Lookup and Replace / Map data between a main dataframe and a lookup dataframe
Ref: [StackOverflow](https://stackoverflow.com/questions/43716045)

In [None]:
main_df = pd.DataFrame({'A': list('abcd'),
                        'B': list('lmno'),
                        'C': list('efgh'),
                        'D': list('qrst')})
main_df

In [None]:
lookup_df = pd.DataFrame({'X': range(4),
                          'Lookup': list('ghqr'),
                          'Val': [10,15,20,30]})

lookup_df

In [None]:
# Set index of the lookup_df to Lookup column
lookup_df = lookup_df.set_index('Lookup')
lookup_df

In [None]:
# Map column C to replace its value with the lookup
main_df.C = main_df.C.map(lookup_df.X)
main_df

In [None]:
# Using replace
main_df.replace(main_df.D, main_df.D.map(lookup_df.X), inplace=True)

In [None]:
main_df

# Computing volatility and comparing it with standard deviation
 Ref: [Motley Fool](https://www.fool.com/knowledge-center/how-to-calculate-annualized-volatility.aspx)

In [None]:
import pandas as pd
from math import sqrt
Ser = pd.Series([1972.18, 1988.87, 1987.66, 1940.51, 1861.61, 1893.21, 1970.81, 2035.73, 2079.61, 2096.92, 
                 2102.44, 2091.54, 2083.39, 2086.05, 2084.07, 2104.18, 2077.57, 2083.56, 2099.84, 2093.32, 2098.04])
Ser.expanding(1).std(ddof=1)  # outputs the expanding-window (cumulative) sample standard deviation

In [None]:
# Get the percent change
Ser.pct_change()

In [None]:
# Get the standard deviation of the percent change
Ser.pct_change().std(ddof=0)  # This is 1.7%

In [None]:
# To get annualized volatility, multiply by the square root of the no of trading days (252)
Ser.pct_change().std(ddof=0)*sqrt(252)  # This gives 27% annual volatility

In [None]:
# To get the expanding (running) annualized volatility
Ser.pct_change().expanding(1).std(ddof=0)*sqrt(252)

# Removing empty lists / dicts [ ] / { } and nan from a dictionary

In [None]:
import pandas as pd
import numpy as np

# Sample values: an empty dict, an empty list, a number, a nan and a string - repeated twice.
values = 2 * [dict(), [], 5, np.nan, 'a']

# One label per value: 'Key1', 'Key2', ...
keys = ['Key' + str(position) for position in range(1, len(values) + 1)]

# Pair each label with the value at the same position.
my_dict = dict(zip(keys, values))

my_dict

In [None]:
# This one gives a type error: `my_dict.items` is missing its call parentheses,
# so the comprehension tries to iterate over the method object itself.
[v for k, v in my_dict.items if v if str(v) != 'nan']

In [None]:
# This is the correct way. First remove empty items, then check if the string has 'nan'
{i: j for i, j in {k: v for k, v in my_dict.items() if v}.items() if str(j) != 'nan'}

# Measuring Wall Time in between programs
Ref: [Stackoverflow](https://stackoverflow.com/a/14452178/7978112)

In [None]:
from time import time, sleep
start = time()

# put the code here
sleep(2)

codetime = time() - start

m, s = divmod(codetime,60)
h, m = divmod(m, 60)

print('{:d}:{:02d}:{:02d}'.format(int(h), int(m), int(s)))

# Making interactive pivotcharts from pandas dataframes
(Ref: [https://towardsdatascience.com/two-essential-pandas-add-ons-499c1c9b65de](https://towardsdatascience.com/two-essential-pandas-add-ons-499c1c9b65de))

In [None]:
import pandas as pd
from pivottablejs import pivot_ui
from IPython.core.display import HTML

df = pd.read_csv('http://bit.ly/imdbratings')

# Write the pivot UI to a file, then display that same file.
# HTML("some/path") would render the path text itself as markup, so the
# file is loaded through the `filename` argument instead.
outfile = "./data/pivottablejs.html"
pivot_ui(df, outfile_path=outfile)
HTML(filename=outfile)

# Requests Check

In [None]:
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
import requests
r = requests.get('https://www.python.org')
r

# Filtering rows by column value

In [None]:
import pandas as pd
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()

In [None]:
movies.shape

In [None]:
type(False)

In [None]:
# Create python list of booleans with the same length of the dataframe. 
#Boolean will be true if it is > 200 mins and false if others
booleans = []
for length in movies.duration:
    if length >= 200:
        booleans.append(True)
    else:
        booleans.append(False)

In [None]:
booleans[0:5]

In [None]:
len(booleans)

In [None]:
# convert booleans list to a pandas series
is_long = pd.Series(booleans)

In [None]:
is_long.head()

In [None]:
# pass is_long to the dataframe movies with bracket notation
movies[is_long].head()

#It shows up dataframe with all columns but only shows those with duration > 200 mins

In [None]:
# There is a shorter way - without the for loop.

#Instead:

In [None]:
# This completely replaces the need for a 'for' loop.
is_long = movies.duration >= 200 # Series (movies.duration)  ... comparison >=200 and returns series of trues and falses
is_long.head()

In [None]:
movies[is_long].head()

In [None]:
# We can eliminate is_long itself
movies[movies.duration >=200].head()

In [None]:
# If we are studying only the genre of the movies
# you can use dot notation

movies[movies.duration >=200].genre

# or in bracket notations
movies[movies['duration'] >=200]['genre']

#The above code may sometimes cause strange behaviour. Not the best way to do things.
#better practice is to use the .loc method

In [None]:
# use .loc with a comma
movies.loc[movies['duration'] >=200, 'genre']

#.loc allows selection of rows and columns by 'label'
# in the above movies['duration'] >= 200 are the rows and 'genre' are the columns

## ...For Multiple filter criteria

In [None]:
import pandas as pd
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()

In [None]:
movies[movies.duration >= 200]

In [None]:
# how do we select only long movies of genre Drama?

In [None]:
True and True

In [None]:
movies[movies.duration >= 200 and movies.genre == 'Drama'] # will not work

In [None]:
# add parenthesis to add evaluation order
# add ampersand & instead of and

movies[(movies.duration >= 200) & (movies.genre == 'Drama')]

In [None]:
# or condition - gives a much bigger dataframe
movies[(movies.duration >= 200) | (movies.genre == 'Drama')].head()
movies[(movies.duration >= 200) | (movies.genre == 'Drama')].shape

In [None]:
# inside the bracket there is a boolean series which tells dataframe which rows display
((movies.duration >= 200) & (movies.genre == 'Drama'))[:5]

In [None]:
# what if there is a bunch of or conditions on the same series
# either crime or drama or action
# normally
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].head()

In [None]:
# the above is very wordy.
# to simplify we can use a series method called 'isin'
# it generates a boolean series
movies.genre.isin(['Crime', 'Drama', 'Action'])[:5]

In [None]:
# The above series can be passed to the DataFrame
movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].head()

# Web Scraping

In [None]:
%%time
# Scraping all HTMLs table from a URL using BeautifulSoup
import pandas as pd
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.5paisa.com/5pit/spma.asp", verify=False)
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')
df = pd.read_html(str(table))
df

In [None]:
df

In [None]:
# %%time
# Scraping an HTML table into pandas
import pandas as pd
url = "https://www.5paisa.com/5pit/spma.asp"
df = pd.read_html(url)[1]
df

In [None]:
from lxml import etree, html
import requests

url = "https://finance.google.com/finance?q=NSE:PFC"

page = requests.get(url)
root = html.fromstring(page.content)
dividend = float(root.findall('.//table')[2].text_content().strip().split("\n")[2].split('/')[0])
dividend

# Use of loc and iloc

In [None]:
## Pandas Index
## ...from Data School - ref: https://www.youtube.com/watch?v=OYZNk7Z9s6I
import pandas as pd
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

In [None]:
drinks.head()

In [None]:
drinks.index

In [None]:
drinks.columns

In [None]:
drinks.shape

In [None]:
pd.read_table('http://bit.ly/movieusers', header = None, sep = '|').head()

In [None]:
drinks[drinks.continent == 'South America']

In [None]:
drinks.loc[23, 'beer_servings']

In [None]:
drinks.set_index('country', inplace=True)
drinks.head()

In [None]:
drinks.index

In [None]:
drinks.columns

In [None]:
drinks.shape

In [None]:
drinks.loc['Brazil', 'beer_servings']

In [None]:
drinks.index.name = None
drinks.head()

In [None]:
drinks.index.name = 'country'

In [None]:
drinks.reset_index(inplace = True)

In [None]:
drinks.head()

In [None]:
drinks.describe()

In [None]:
drinks.describe().loc['25%', 'beer_servings']

In [None]:
drinks.head()

In [None]:
drinks.continent.head()

In [None]:
drinks.set_index('country', inplace=True)

In [None]:
drinks.head()

In [None]:
drinks.continent.head()

In [None]:
drinks.continent.value_counts()

In [None]:
drinks.continent.value_counts()['Africa']

In [None]:
drinks.continent.value_counts().sort_values()

In [None]:
drinks.continent.value_counts().sort_index()

In [None]:
people = pd.Series([3000000, 85000], index=['Albania', 'Andorra'], name = 'population')
people

In [None]:
drinks.beer_servings * people

In [None]:
pd.concat([drinks, people], axis=1).head()

In [None]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

In [None]:
drinks.head()

In [None]:
drinks.info()

In [None]:
drinks.info(memory_usage = 'deep')

In [None]:
drinks.memory_usage(deep=True)

In [None]:
drinks.memory_usage(deep=True).sum()

In [None]:
sorted(drinks.continent.unique())

In [None]:
drinks.continent.head()

In [None]:
drinks['continent'] = drinks.continent.astype('category')

In [None]:
drinks.dtypes

In [None]:
drinks.continent.head()

In [None]:
drinks.continent.cat.codes.head()

In [None]:
drinks.memory_usage(deep=True)

In [None]:
drinks['country'] = drinks.country.astype('category')

In [None]:
drinks.memory_usage(deep=True)

In [None]:
drinks.country.cat.categories

# Handling date and time in python

In [None]:
import pandas as pd
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

In [None]:
# Analyze sightings by year or time
# Check the dtypes
ufo.dtypes # time column shows an object - in this case a string

In [None]:
ufo.Time.str.slice(-5, -3).head()  #outputs as a string

In [None]:
ufo.Time.str.slice(-5, -3).astype(int).head()

In [None]:
# the above approach is very brittle. It easily breaks
# overwrite the time column. Overwrite the Time column.
ufo['Time'] = pd.to_datetime(ufo.Time)

In [None]:
ufo.head()

In [None]:
# real thing that's changed is dtype is now datetime
ufo.dtypes

In [None]:
# pandas just figures out the date. If not, there are lot of options in to_datetime
ufo.Time.dt.hour # Pulls out the hour
# day_name() replaces the .weekday_name attribute, which was removed in pandas 1.0
ufo.Time.dt.day_name()[:4] # Pulls out the name of the weekday!
# Search the reference page for '.dt.'

In [None]:
# Let's pass a string instead of a series
pd.to_datetime('1/1/1999') #outputs a timestamp. Did not have to specify month and year

In [None]:
ts = pd.to_datetime('1/1/1999') # Save it for comparison

In [None]:
ufo.loc[ufo.Time >= ts, :].head() #only shows othe ufo's sighted after 1/1/1999

In [None]:
# can do math operations
ufo.Time.max() # Latest timestamp in the time series

In [None]:
ufo.Time.max() - ufo.Time.min() # time delta object tells the difference

In [None]:
#timedelta objects have attributes like .days
(ufo.Time.max() - ufo.Time.min()).days

In [None]:
# no of ufo reports by year. Plot!
%matplotlib inline
ufo['Year'] = ufo.Time.dt.year

In [None]:
ufo.head()

In [None]:
ufo.Year.value_counts().sort_index().plot()  #sort by order of index

# Handling inputs in date format
The following code keeps prompting for a date and parses the free-form text entered by the user into a `datetime`.

In [None]:
from datetime import datetime
from dateutil.parser import parse

# Keep prompting until the text parses as a date; only then leave the loop.
# Previously the text was parsed a second time outside the try block and the
# loop always broke, so invalid input crashed instead of prompting again.
while True:
    expiry = input('Expiry: ')
    try:
        expiry = parse(expiry)
    except (ValueError, OverflowError):
        # dateutil could not make a date out of the text: ask again.
        print("Enter date in any proper format")
    else:
        break

expiry

# Vectorize

In [None]:
import pandas as pd
import numpy as np
import requests

# x = ['~', 'A', 'Sun']   # Works
x = ['~', 'Walter', 'A', 'Sun'] # Doesn't work because of Walter 
df = pd.DataFrame(x, columns=['x'])

u = "https://en.wikipedia.org/wiki/"

df['URL'] = u + df['x']

def tbl10(u):
    """Return the eleventh HTML table (index 10) found on the page at URL *u*.

    pandas downloads and parses the page itself, so no separate request is
    made here. Raises IndexError when the page holds fewer than eleven tables.
    """
    return pd.read_html(u)[10]

v = np.vectorize(tbl10)
pd.concat(v(df.URL))


# Try-Except Error detection

In [None]:
# Keep asking until the entered text converts cleanly to an int.
while True:
    try:
        x = int(input("Please enter a number: "))
        break
    except ValueError:
        # int() rejected the text: say so and prompt again.
        print("Oops! That was no valid number. Try again...")

# Examples of where

In [None]:
import pandas as pd, numpy as np
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
df

In [None]:
m = df % 3 == 0
m

# Map and Apply experiments

In [None]:
import pandas as pd
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()

In [None]:
#translate sex to 1 and 0 using map
train['Sex_num'] = train.Sex.map({'female':0, 'male':1})

In [None]:
#compare sex and sex num
train.loc[0:4, ['Sex', 'Sex_num']]

In [None]:
#use apply method for a function
train['Name_len'] = train.Name.apply(len)

train.loc[0:4, ['Name', 'Name_len']]

In [None]:
import numpy as np
train['Fare_ceil'] = train.Fare.apply(np.ceil)

train.loc[0:4, ['Fare', 'Fare_ceil']]

train.Name.str.split(',').head()

In [None]:
def get_element(my_list, position):
    """Return the item of *my_list* found at index *position*."""
    element = my_list[position]
    return element

train.Name.str.split(',').apply(get_element, position=0).head()

In [None]:
train.Name.str.split(',').apply(lambda x: x[0]).head()

In [None]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=1)

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)

In [None]:
# Name of the column holding each row's largest value.
# idxmax returns the column label; apply(np.argmax) returns integer positions on current pandas.
drinks.loc[:, 'beer_servings':'wine_servings'].idxmax(axis=1)

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float).head()

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings']=drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float)

drinks.head()

# Experiments with arguments

In [None]:
def varargs(*args):
    """Return every positional argument, packed into one tuple."""
    return tuple(args)

varargs(1, 2, 3)

In [None]:
def keyword_args(**kwargs):
    """Return the keyword arguments as a name -> value dictionary."""
    return dict(kwargs)

keyword_args(big="foot", loch="ness")

In [None]:
def all_the_args(*args, **kwargs):
    """Print the positional-argument tuple, then the keyword-argument dict."""
    for received in (args, kwargs):
        print(received)

all_the_args(1, 2, a=3, b=4)

In [None]:
# * unpacks the tuple into four positional arguments
args = 1, 2, 3, 4
kwargs = dict(a=3, b=4)
all_the_args(*args)

In [None]:
# A single * iterates the dict, so only its keys ('a', 'b') arrive — as positional arguments
all_the_args(*kwargs)

In [None]:
# ** expands the dict into the keyword arguments a=3, b=4
all_the_args(**kwargs)

In [None]:
# Both together: the tuple supplies the positional arguments, the dict the keyword ones
all_the_args(*args, **kwargs)

# groupby in pandas

In [None]:
import pandas as pd
# Download the CSV behind the short link for the groupby examples
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

In [None]:
# Preview the first rows
drinks.head()

In [None]:
# average beer servings across all rows
drinks.beer_servings.mean()

In [None]:
# average beer servings, computed separately for each continent
drinks.groupby('continent').beer_servings.mean()

In [None]:
# How does that work? Filter one continent by hand and take its mean to compare
drinks[drinks.continent == 'Africa'].beer_servings.mean()

In [None]:
# other aggregation functions work the same way, e.g. max, min, etc.
drinks.groupby('continent').beer_servings.max()

In [None]:
# even more powerful is .agg, which takes several aggregation functions at once
drinks.groupby('continent').beer_servings.agg(['count', 'min', 'max', 'mean'])

In [None]:
# With no column selected, mean() is computed for every numeric column.
# numeric_only=True is required on pandas >= 2.0, where the default no longer
# drops text columns such as country and mean() raises TypeError on them.
drinks.groupby('continent').mean(numeric_only=True)

In [None]:
# display in visual
%matplotlib inline

In [None]:
# Bar chart of the per-continent means of the numeric columns.
# numeric_only=True keeps mean() from failing on the text country column
# on pandas >= 2.0.
drinks.groupby('continent').mean(numeric_only=True).plot(kind='bar')

In [None]:
# Iterating a groupby yields (group key, sub-DataFrame) pairs; print each pair
g = drinks.groupby('continent')
for continent, continent_df in g:
    print(continent, continent_df, sep='\n')

In [None]:
# pull out the sub-DataFrame belonging to one group key
g.get_group('Africa').head()

In [None]:
# The concept is split, apply and combine
drinks.groupby('continent').agg('max')  # groupby splits the rows, agg applies 'max' to each group, and the per-group results are combined into one frame

In [None]:
# gives all the details: summary statistics for every group
g.describe()

In [None]:
%matplotlib inline
# plot() on the groupby object draws a separate plot for each group
g.plot()

# ib_insync

In [None]:
#***          Start ib_insync (run once)       *****
#_______________________________________________

from ib_insync import *
# Lets ib_insync run inside the notebook's already-running event loop
util.startLoop()
# NOTE(review): every connect line below is commented out, so `ib` (used by the
# later cells) is never created here — uncomment the one matching your setup.
# The tws and IBG lines of each pair are currently identical (same port).
# ib = IB().connect('127.0.0.1', 3000, clientId=0) # kavi tws live
# ib = IB().connect('127.0.0.1', 3000, clientId=0) # kavi IBG live

# ib = IB().connect('127.0.0.1', 1300, clientId=0) # rkv tws live
# ib = IB().connect('127.0.0.1', 1300, clientId=0) # rkv IBG live


## Paths and Variables

In [None]:
#******         Paths and variables         ****
#_______________________________________________

# Relative folder path kept in one place (presumably where data files live — confirm)
datapath = './zdata/'

## Error catching in list comprehension

In [None]:
#******   Error catch in list comprehension  ****
#________________________________________________

def catch(func, handle=lambda e : e, *args, **kwargs):
    """Call ``func(*args, **kwargs)`` and return its result, or None on error.

    Intended for use inside list/dict comprehensions, where a try/except
    statement cannot be written inline.

    func: callable to invoke.
    handle: accepted but never called by this implementation. Because it is
        the second positional parameter, anything meant for ``func`` must come
        after a placeholder for it (or be passed by keyword).
    *args, **kwargs: forwarded to ``func`` unchanged.

    Any ``Exception`` raised by ``func`` is swallowed and None is returned.
    """
    result = None
    try:
        result = func(*args, **kwargs)
    except Exception:
        # Deliberately silent: a failed call is reported as None
        pass
    return result

# Get Standard Deviation for an NSE scrip
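This cell fetches a year of daily price bars for an NSE scrip from IBKR, then computes the standard deviation and mean of the closing prices along with the highest high and lowest low over those bars.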

In [None]:
#... Get the scrip
symbol = 'RELIANCE'
contract = ib.qualifyContracts(Stock(symbol, exchange))[0]

#... Request daily TRADES bars covering the last 12 months
duration = '12 M'
size = '1 day'
bars = ib.reqHistoricalData(
    contract=contract,
    endDateTime='',
    durationStr=duration,
    barSizeSetting=size,
    whatToShow='TRADES',
    useRTH=True,
    formatDate=1,
    keepUpToDate=True,
)

#... Closing-price statistics, then the high/low extremes over the same bars
closes = [b.close for b in bars]
stDev = np.std(closes)  # ddof defaults to 0
meanPrice = np.mean(closes)

hi52 = max(b.high for b in bars)
lo52 = min(b.low for b in bars)

# Get option prices with greeks (via tickers)
The best way to extract greeks and prices for options is to get them through the option chain and expiries as follows...

In [None]:
#... Get the scrip
symbol = 'RELIANCE'
contract = ib.qualifyContracts(Stock(symbol, exchange))[0]

#... Get the option chain tickers
chains = ib.reqSecDefOptParams(underlyingSymbol=contract.symbol, 
                      futFopExchange='', 
                      underlyingConId=contract.conId, underlyingSecType=contract.secType)

# Union of the expirations of every returned chain.
# set(*[...]) only worked when exactly one chain came back; with several it
# passed several arguments to set() and raised TypeError.
expiries = {e for c in chains for e in c.expirations}

# One contract-details request per expiry, flattened into a list of option contracts
cds = [ib.reqContractDetails(Option(symbol, e, exchange='NSE')) for e in expiries]

options = [c.contract for cs in cds for c in cs]

# Tickers are requested in batches of 100 contracts, twice: the first pass
# starts the data flowing, the second reads it after the pause
tickers = [t for i in range(0, len(options), 100) for t in ib.reqTickers(*options[i:i + 100])]    
ib.sleep(5)   # gives some time to fill the tickers    
tickers = [t for i in range(0, len(options), 100) for t in ib.reqTickers(*options[i:i + 100])]

# keep only those tickers with underlying prices
# NOTE(review): `utils` is not defined in the visible part of this notebook —
# confirm it provides a catch() that returns None when the lambda raises
lib_t = {t: utils.catch(lambda: t.modelGreeks.undPrice) for t in tickers}
und_t = [k for k, v in lib_t.items() if v is not None]

# Time conversion

In [None]:
import pandas as pd

# Generate ten month-end timestamps in Brussels local time.
# An explicit MonthEnd offset replaces the lowercase 'm' alias, which newer
# pandas releases deprecate or reject; the offset object works on every version.
datetimes = pd.date_range(start="2013-05-18 12:00:00", periods=10,
                          freq=pd.offsets.MonthEnd(), tz="Europe/Brussels")
datetimes

In [None]:
# Re-express the same instants in Singapore local time and show them as a Series
# background: https://stackoverflow.com/questions/14004545
pd.Series(datetimes.tz_convert('Asia/Singapore'))