# Requests Check

In [10]:
import urllib3
# Suppress urllib3's InsecureRequestWarning for the rest of the kernel session.
# NOTE(review): presumably needed because a later cell calls
# requests.get(..., verify=False) - confirm, and remove once that call verifies TLS.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
import requests
r = requests.get('https://www.python.org')
r

# Filtering rows by column value

In [2]:
import pandas as pd
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()

Unnamed: 0,star_rating,title,content_rating,genre,duration,actors_list
0,9.3,The Shawshank Redemption,R,Crime,142,"[u'Tim Robbins', u'Morgan Freeman', u'Bob Gunt..."
1,9.2,The Godfather,R,Crime,175,"[u'Marlon Brando', u'Al Pacino', u'James Caan']"
2,9.1,The Godfather: Part II,R,Crime,200,"[u'Al Pacino', u'Robert De Niro', u'Robert Duv..."
3,9.0,The Dark Knight,PG-13,Action,152,"[u'Christian Bale', u'Heath Ledger', u'Aaron E..."
4,8.9,Pulp Fiction,R,Crime,154,"[u'John Travolta', u'Uma Thurman', u'Samuel L...."


In [None]:
movies.shape

In [None]:
type(False)

In [None]:
# Build a plain Python list of booleans, one entry per row of the dataframe:
# True when the movie runs 200 minutes or longer, False otherwise.
booleans = []
for minutes in movies.duration:
    booleans.append(True if minutes >= 200 else False)

In [None]:
booleans[0:5]

In [None]:
len(booleans)

In [None]:
# convert booleans list to a pandas series
is_long = pd.Series(booleans)

In [None]:
is_long.head()

In [None]:
# pass is_long to the dataframe movies with bracket notation
movies[is_long].head()

# The result keeps every column but only the rows whose duration is >= 200 mins

In [None]:
# There is a shorter way - without the for loop.

#Instead:

In [None]:
# This completely replaces the need for a 'for' loop.
is_long = movies.duration >= 200 # Series (movies.duration)  ... comparison >=200 and returns series of trues and falses
is_long.head()

In [None]:
movies[is_long].head()

In [None]:
# We can eliminate is_long itself
movies[movies.duration >=200].head()

In [None]:
# If we are studying only the genre of the movies
# you can use dot notation

movies[movies.duration >=200].genre

# or in bracket notations
movies[movies['duration'] >=200]['genre']

#The above code may sometimes cause strange behaviour. Not the best way to do things.
#better practice is to use the .loc method

In [None]:
# use .loc with a comma
movies.loc[movies['duration'] >=200, 'genre']

#.loc allows selection of rows and columns by 'label'
# in the above movies['duration'] >= 200 are the rows and 'genre' are the columns

## ...For Multiple filter criteria

In [None]:
import pandas as pd
movies = pd.read_csv('http://bit.ly/imdbratings')
movies.head()

In [None]:
movies[movies.duration >= 200]

In [None]:
# how do we select only long movies of genre Drama?

In [None]:
True and True

In [None]:
# `and` will not work here: Python asks each boolean Series for a single truth
# value, which is ambiguous, so pandas raises ValueError. Catch and show the
# message so the demonstration does not stop a Restart & Run All.
try:
    movies[movies.duration >= 200 and movies.genre == 'Drama']
except ValueError as err:
    print(err)

In [None]:
# wrap each condition in parentheses so it is evaluated before being combined
# (& binds more tightly than the >= and == comparisons)
# use the ampersand & instead of `and`

movies[(movies.duration >= 200) & (movies.genre == 'Drama')]

In [None]:
# or condition - gives a much bigger dataframe
# Build the filtered frame once and reuse it for both views.
long_or_drama = movies[(movies.duration >= 200) | (movies.genre == 'Drama')]
# Only the last expression of a cell is displayed automatically, so the
# preview has to be shown explicitly.
display(long_or_drama.head())
long_or_drama.shape

In [None]:
# inside the bracket there is a boolean series which tells dataframe which rows display
((movies.duration >= 200) & (movies.genre == 'Drama'))[:5]

In [None]:
# what if there is a bunch of or conditions on the same series
# either crime or drama or action
# normally
movies[(movies.genre == 'Crime') | (movies.genre == 'Drama') | (movies.genre == 'Action')].head()

In [None]:
# the above is very wordy.
# to simplify we can use a series method called 'isin'
# it generates a boolean series
movies.genre.isin(['Crime', 'Drama', 'Action'])[:5]

In [None]:
# The above series can be passed to the DataFrame
movies[movies.genre.isin(['Crime', 'Drama', 'Action'])].head()

# Web Scraping

In [8]:
%%time
# Scraping all HTMLs table from a URL using BeautifulSoup
import pandas as pd
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.5paisa.com/5pit/spma.asp", verify=False)
soup = BeautifulSoup(res.content, 'lxml')
table = soup.find_all('table')
df = pd.read_html(str(table))
df



Wall time: 1.79 s


In [9]:
df

[                   0                        1
 0  Margin Calculator  Span Margin May 4, 2018,
              0      1       2        3        4              5             6   \
 0        Symbol   Mlot  SpMgn%  ExpMgn%  TotMgn%  SpMgnPerShare  ExpMgnPerShr   
 1    PCJEWELLER   1500   40.21    31.04    71.25          44.56       34.3923   
 2    JPASSOCIAT  34000   14.78    21.69    36.47           2.92        4.2838   
 3          RCOM  28000    15.6    20.54    36.14           2.52        3.3172   
 4           IRB   2500      11       20       31          29.81          54.2   
 5    BALRAMCHIN   3500   13.71       15    28.71           9.16         10.02   
 6        RNAVAL   9000   20.05     7.21    27.26           3.62        1.3014   
 7    JETAIRWAYS   1200   10.76       15    25.76          63.86       88.9575   
 8      JUSTDIAL   1400   12.31    13.38    25.69          50.25       54.5837   
 9          DHFL   1500    8.32       15    23.32          54.32       97.9275   
 10

In [None]:
# %%time
# Scraping an HTML table into pandas
import pandas as pd
url = "https://www.5paisa.com/5pit/spma.asp"
df = pd.read_html(url)[1]
df

In [None]:
from lxml import etree, html
import requests

url = "https://finance.google.com/finance?q=NSE:PFC"

# Download the page and parse the response body into an lxml element tree.
page = requests.get(url)
root = html.fromstring(page.content)

# Third <table> on the page -> its stripped text -> third line ->
# the part before the first '/' converted to a float.
tables = root.findall('.//table')
table_lines = tables[2].text_content().strip().split("\n")
dividend = float(table_lines[2].split('/')[0])
dividend

# Use of loc and iloc

In [None]:
## Pandas Index
## ...from Data School - ref: https://www.youtube.com/watch?v=OYZNk7Z9s6I
import pandas as pd
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

In [None]:
drinks.head()

In [None]:
drinks.index

In [None]:
drinks.columns

In [None]:
drinks.shape

In [None]:
pd.read_table('http://bit.ly/movieusers', header = None, sep = '|').head()

In [None]:
drinks[drinks.continent == 'South America']

In [None]:
drinks.loc[23, 'beer_servings']

In [None]:
drinks.set_index('country', inplace=True)
drinks.head()

In [None]:
drinks.index

In [None]:
drinks.columns

In [None]:
drinks.shape

In [None]:
drinks.loc['Brazil', 'beer_servings']

In [None]:
drinks.index.name = None
drinks.head()

In [None]:
drinks.index.name = 'country'

In [None]:
drinks.reset_index(inplace = True)

In [None]:
drinks.head()

In [None]:
drinks.describe()

In [None]:
drinks.describe().loc['25%', 'beer_servings']

In [None]:
drinks.head()

In [None]:
drinks.continent.head()

In [None]:
drinks.set_index('country', inplace=True)

In [None]:
drinks.head()

In [None]:
drinks.continent.head()

In [None]:
drinks.continent.value_counts()

In [None]:
drinks.continent.value_counts()['Africa']

In [None]:
drinks.continent.value_counts().sort_values()

In [None]:
drinks.continent.value_counts().sort_index()

In [None]:
people = pd.Series([3000000, 85000], index=['Albania', 'Andorra'], name = 'population')
people

In [None]:
drinks.beer_servings * people

In [None]:
pd.concat([drinks, people], axis=1).head()

In [None]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')

In [None]:
drinks.head()

In [None]:
drinks.info()

In [None]:
drinks.info(memory_usage = 'deep')

In [None]:
drinks.memory_usage(deep=True)

In [None]:
drinks.memory_usage(deep=True).sum()

In [None]:
sorted(drinks.continent.unique())

In [None]:
drinks.continent.head()

In [None]:
drinks['continent'] = drinks.continent.astype('category')

In [None]:
drinks.dtypes

In [None]:
drinks.continent.head()

In [None]:
drinks.continent.cat.codes.head()

In [None]:
drinks.memory_usage(deep=True)

In [None]:
drinks['country'] = drinks.country.astype('category')

In [None]:
drinks.memory_usage(deep=True)

In [None]:
drinks.country.cat.categories

# Handling date and time in python

In [None]:
import pandas as pd
ufo = pd.read_csv('http://bit.ly/uforeports')
ufo.head()

In [None]:
# Analyze sightings by year or time
# Check the dtypes
ufo.dtypes # time column shows an object - in this case a string

In [None]:
ufo.Time.str.slice(-5, -3).head()  #outputs as a string

In [None]:
ufo.Time.str.slice(-5, -3).astype(int).head()

In [None]:
# The string-slicing approach above is brittle and breaks easily.
# Instead, overwrite the Time column with parsed datetime values.
ufo['Time'] = pd.to_datetime(ufo.Time)

In [None]:
ufo.head()

In [None]:
# real thing that's changed is dtype is now datetime
ufo.dtypes

In [None]:
# pandas just figures out the date. If not, there are lot of options in to_datetime
ufo.Time.dt.hour # Pulls out the hour
# .dt.weekday_name was removed in pandas 1.0; .dt.day_name() is its replacement
ufo.Time.dt.day_name()[:4] # Pulls out the name of the weekday for the first four rows
# Search the reference page for '.dt.'

In [None]:
# Let's pass a string instead of a series
pd.to_datetime('1/1/1999') #outputs a timestamp. Did not have to specify month and year

In [None]:
ts = pd.to_datetime('1/1/1999') # Save it for comparison

In [None]:
ufo.loc[ufo.Time >= ts, :].head() # only shows the UFOs sighted on or after 1/1/1999

In [None]:
# can do math operations
ufo.Time.max() # Latest timestamp in the time series

In [None]:
ufo.Time.max() - ufo.Time.min() # time delta object tells the difference

In [None]:
#timedelta objects have attributes like .days
(ufo.Time.max() - ufo.Time.min()).days

In [None]:
# no of ufo reports by year. Plot!
%matplotlib inline
ufo['Year'] = ufo.Time.dt.year

In [None]:
ufo.head()

In [None]:
ufo.Year.value_counts().sort_index().plot()  #sort by order of index

# Handling inputs in date format
The following code converts free-form text input into a `datetime` object.

In [1]:
from datetime import datetime
from dateutil.parser import parse

# Keep prompting until the text can be parsed as a date.
# Previously a failed parse printed the hint and then fell through to a second,
# unguarded parse() call, so bad input crashed the cell and was never re-asked.
while True:
    raw_expiry = input('Expiry: ')
    try:
        expiry = parse(raw_expiry)
    except (ValueError, OverflowError):
        # dateutil raises ValueError (ParserError) for unrecognised text and
        # OverflowError for out-of-range numbers; both mean "ask again".
        print("Enter date in any proper format")
    else:
        break

expiry

Expiry: May 13, 2018


datetime.datetime(2018, 5, 13, 0, 0)

# Vectorize

In [None]:
import pandas as pd
import numpy as np
import requests

# x = ['~', 'A', 'Sun']   # Works
x = ['~', 'Walter', 'A', 'Sun'] # Doesn't work because of Walter 
df = pd.DataFrame(x, columns=['x'])

u = "https://en.wikipedia.org/wiki/"

df['URL'] = u + df['x']

def tbl10(u):
    """Return the table at index 10 (the eleventh) parsed from the page at URL ``u``.

    Raises IndexError when the page yields fewer than eleven tables.
    """
    # read_html fetches the URL itself; the earlier requests.get() result was
    # never used and only downloaded the page a second time.
    tbl = pd.read_html(u)[10]
    return tbl

v = np.vectorize(tbl10)
pd.concat(v(df.URL))


# Try-Except Error detection

In [None]:
# Keep asking until the reply converts cleanly to an integer.
x = None
while x is None:
    try:
        x = int(input("Please enter a number: "))
    except ValueError:
        print("Oops! That was no valid number. Try again...")

# Examples of where

In [None]:
import pandas as pd, numpy as np
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
df

In [None]:
m = df % 3 == 0
m

# Map and Apply experiments

In [None]:
import pandas as pd
train = pd.read_csv('http://bit.ly/kaggletrain')
train.head()

In [None]:
#translate sex to 1 and 0 using map
train['Sex_num'] = train.Sex.map({'female':0, 'male':1})

In [None]:
#compare sex and sex num
train.loc[0:4, ['Sex', 'Sex_num']]

In [None]:
#use apply method for a function
train['Name_len'] = train.Name.apply(len)

train.loc[0:4, ['Name', 'Name_len']]

In [None]:
import numpy as np
train['Fare_ceil'] = train.Fare.apply(np.ceil)

train.loc[0:4, ['Fare', 'Fare_ceil']]

train.Name.str.split(',').head()

In [None]:
def get_element(my_list, position):
    """Return the item of ``my_list`` stored at index ``position``."""
    element = my_list[position]
    return element

train.Name.str.split(',').apply(get_element, position=0).head()

In [None]:
train.Name.str.split(',').apply(lambda x: x[0]).head()

In [None]:
drinks = pd.read_csv('http://bit.ly/drinksbycountry')
drinks.head()

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=1)

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings'].apply(max, axis=0)

In [None]:
# idxmax returns the column *label* holding each row's maximum; np.argmax applied
# row-wise returns an integer position on current pandas instead of the label.
drinks.loc[:, 'beer_servings':'wine_servings'].idxmax(axis=1)

In [None]:
drinks.loc[:, 'beer_servings':'wine_servings'].applymap(float).head()

In [None]:
# Replace the columns through plain column selection rather than .loc[:, ...]:
# recent pandas writes a .loc column slice in place and may keep the integer
# dtype. astype(float) is the built-in equivalent of applymap(float), which is
# deprecated in recent pandas.
serving_cols = drinks.loc[:, 'beer_servings':'wine_servings'].columns
drinks[serving_cols] = drinks[serving_cols].astype(float)

drinks.head()

# Experiments with arguments

In [None]:
def varargs(*args):
    """Return all positional arguments, packed into a tuple."""
    positional = tuple(args)
    return positional

varargs(1,2,3)

In [None]:
def keyword_args(**kwargs):
    """Return all keyword arguments as a dict mapping name to value."""
    named = dict(kwargs)
    return named

keyword_args(big="foot", loch="ness")

In [None]:
def all_the_args(*args, **kwargs):
    """Print the positional-argument tuple, then the keyword-argument dict, one per line."""
    print(args, kwargs, sep="\n")

all_the_args(1, 2, a=3, b=4)

In [None]:
args = (1, 2, 3, 4)
kwargs = {"a": 3, "b": 4}
all_the_args(*args)

In [None]:
all_the_args(*kwargs)

In [None]:
all_the_args(**kwargs)

In [None]:
all_the_args(*args, **kwargs)