### 1) Pandas

In [2]:
import pandas as pd
df = pd.read_csv("Book1.csv")

df.head()

Unnamed: 0,Key,Key2,c,d
0,1,1.0,1.0,1.0
1,1,2.0,2.0,2.0
2,# yo this aint a row,,,
3,1,3.0,3.0,3.0
4,1,5.0,5.0,5.0


In [4]:
df = pd.read_csv("Book1.csv", header = None)

df.head()

Unnamed: 0,0,1,2,3
0,a,b,c,d
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,3,3,3,3


###### to change the names of columns

In [7]:
df = pd.read_csv("Book1.csv",names = ["A","B","C","D"])

df.head()

Unnamed: 0,A,B,C,D
0,a,b,c,d
1,1,1,1,1
2,2,2,2,2
3,3,3,3,3
4,3,3,3,3


###### creating index columns

In [12]:
df = pd.read_csv("Book1.csv",index_col= ["Key","Key2"])

df.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,c,d
Key,Key2,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1,1
1,2,2,2
1,3,3,3
1,3,3,3
1,5,5,5
2,1,6,6
2,2,7,7
2,3,8,8
2,3,9,9
2,5,10,10


###### skipping over rows that have garbage values

In [16]:
df = pd.read_csv("Book1.csv",skiprows = [3])

df.head()

Unnamed: 0,Key,Key2,c,d
0,1,1,1,1
1,1,2,2,2
2,1,3,3,3
3,1,5,5,5
4,2,1,6,6


###### How to scrape dataframes from websites using pandas?

In [5]:
url ="https://www.basketball-reference.com/leagues/NBA_2015_totals.html"
web_data = pd.read_html(url) #returns a list of dataframes that it
                             #found on the site

In [10]:
web_data[1] #meaining we have just one dataset

In [12]:
df = web_data[0]
df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Quincy Acy,PF,24,NYK,68,22,1287,152,331,...,0.784,79,222,301,68,27,22,60,147,398
1,2,Jordan Adams,SG,20,MEM,30,0,248,35,86,...,0.609,9,19,28,16,16,7,14,24,94
2,3,Steven Adams,C,21,OKC,70,67,1771,217,399,...,0.502,199,324,523,66,38,86,99,222,537
3,4,Jeff Adrien,PF,28,MIN,17,0,215,19,44,...,0.579,23,54,77,15,4,9,9,30,60
4,5,Arron Afflalo,SG,29,TOT,78,72,2502,375,884,...,0.843,27,220,247,129,41,7,116,167,1035


###### whenever we want to access multiple columns we have to pass it as a list

In [13]:
df[['MP',"Tm"]]

Unnamed: 0,MP,Tm
0,1287,NYK
1,248,MEM
2,1771,OKC
3,215,MIN
4,2502,TOT
...,...,...
670,2434,TOT
671,1605,MIN
672,829,BRK
673,1487,CHO


In [16]:
type(df[["MP","Tm"]])

pandas.core.frame.DataFrame

###### How to limit the number of rows that are read into pandas

###### Sometimes if we have a Large dataset which does not fit in our memory then we can divide it into chunks and read it, like this.

In [52]:
df = pd.read_csv("Book1.csv",chunksize = 1) #use nrows operation

df

<pandas.io.parsers.TextFileReader at 0x1d16067d4e0>

###### One chunk is basically one row of the dataframe

In [53]:

chunk_list = []  # append each chunk df here 

# Each chunk is in df format
for chunk in df:
    chunk_list.append(chunk)
    
# concat the list into dataframe 
df_concat = pd.concat(chunk_list)

In [54]:
chunk_list[1]

Unnamed: 0,Key,Key2,c,d
5,2,1,6,6
6,2,2,7,7
7,2,3,8,8
8,2,3,9,9
9,2,5,10,10


In [55]:
df_concat

Unnamed: 0,Key,Key2,c,d
0,1,1.0,1.0,1.0
1,1,2.0,2.0,2.0
2,# yo this aint a row,,,
3,1,3.0,3.0,3.0
4,1,5.0,5.0,5.0
5,2,1.0,6.0,6.0
6,2,2.0,7.0,7.0
7,2,3.0,8.0,8.0
8,2,3.0,9.0,9.0
9,2,5.0,10.0,10.0


###### DateRange Pandas

In [63]:
dates = pd.date_range('1/1/2020', periods = 10)
dates

DatetimeIndex(['2020-01-01', '2020-01-02', '2020-01-03', '2020-01-04',
               '2020-01-05', '2020-01-06', '2020-01-07', '2020-01-08',
               '2020-01-09', '2020-01-10'],
              dtype='datetime64[ns]', freq='D')

In [68]:
import numpy as np

ds = pd.Series(np.arange(10), index = [dates])
ds

2020-01-01    0
2020-01-02    1
2020-01-03    2
2020-01-04    3
2020-01-05    4
2020-01-06    5
2020-01-07    6
2020-01-08    7
2020-01-09    8
2020-01-10    9
dtype: int32

In [69]:
ds.to_csv("myTimeSeries.csv")