In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('../input/test-data/data.csv')
df.head()

In [None]:
df = pd.read_csv('../input/test-data/data.csv',names=['key1','key2','text','name_col','extra_col'])
df.head()

In [None]:
df = pd.read_csv('../input/test-data/data.csv',names=['key1','key2','text','name_col'],index_col=['key1','key2'])
df

In [None]:
#Skip rows in a dataset
df = pd.read_csv('../input/test-data/data.csv',skiprows=[0,3])
df

In [None]:
#To check the null values
df = pd.read_csv('../input/test-data/data.csv',names=['key1','key2','text','name_col','extra_col'])
df.isnull()

In [None]:
# pd.read_html picks up only the tabular (<table>) content of the page and
# stores each table it finds at a separate index of the returned list
score = pd.read_html('https://www.basketball-reference.com/leagues/NBA_2015_totals.html')
type(score)

In [None]:
# score is a list of tables, so index into it to get the first table's data
score[0]

In [None]:
#Read only single column from the data
df = score[0]
df['Player']

In [None]:
#Read multiple columns
df[['Player','Age']]

In [None]:
# To read all column names
df.columns

In [None]:
titanic = pd.read_csv('../input/titanic/train_and_test2.csv')
titanic.head()

In [None]:
#Read datatypes of each
titanic.dtypes

In [None]:
titanic.isnull().sum() #To take sum of all the null values

In [None]:
titanic = pd.read_csv('../input/titanic/train_and_test2.csv',nrows = 15) #Restricted to only 15 records
titanic

In [None]:
titanic.to_csv("./out.csv")

In [None]:
titanic.to_csv("./@seperated_data.csv",sep='@')

In [None]:
df = pd.read_csv('../input/test-data/data.csv',names=['key1','key2','text','name_col','extra_col'])
df

In [None]:
import sys

In [None]:
df.to_csv(sys.stdout,na_rep = 'utsav') 
# na_rep is the replacement string written in place of missing (NaN) values
# passing sys.stdout writes the CSV text to the cell output instead of a file

In [None]:
df

In [None]:
df.to_csv('out1.csv', index = False, header = False) #No index No header

In [None]:
date = pd.date_range('1/1/2020',periods = 7)
date

In [None]:
ts = pd.Series(np.arange(7),index=date)
ts

In [None]:
# Series of 0..2 labelled with letters
ts = pd.Series(data=np.arange(3), index=list('abc'))
ts

## JSON Data

In [None]:
obj = """
{
	"id": "0001",
	"type": "donut",
	"name": "Cake",
	"image":
		{
			"url": "images/0001.jpg",
			"width": 200,
			"height": 200
		},
	"thumbnail":
		{
			"url": "images/thumbnails/0001.jpg",
			"width": 32,
			"height": 32
		}
}"""
type(obj)

In [None]:
# json.loads parses the JSON string and converts it into a dict
import json
result = json.loads(obj)
print(type(result))
result

In [None]:
# json.dumps (not loads) converts the dict back into a JSON string
result1 = json.dumps(result)
print(type(result1))
result1

In [None]:
import requests

In [None]:
# Fetch the issue list of the pandas repository from the GitHub API.
# A timeout is passed explicitly: without one, requests waits indefinitely on an
# unresponsive server and the notebook kernel would hang on this cell.
resp = requests.get('https://api.github.com/repos/pandas-dev/pandas/issues', timeout=10)
resp

In [None]:
data = resp.json()
data[0]['user']

# Pandas Data Manipulation

In [None]:
titanic_train = pd.read_csv("https://gist.githubusercontent.com/michhar/2dfd2de0d4f8727f873422c5d959fff5/raw/ff414a1bcfcba32481e4d4e8db578e55872a2ca1/titanic.csv",
                           sep='\t')   
titanic_train

In [None]:
titanic_train.describe() #Only describes Numeric values

In [None]:
titanic_train.isnull().sum()

In [None]:
np.where(titanic_train['Age'].isnull())

In [None]:
a = titanic_train.dtypes[titanic_train.dtypes == 'object'].index
a

In [None]:
titanic_train[a].describe() #Described the object values

In [None]:
titanic_train['Name'][:20] #First 20 names

In [None]:
sorted(titanic_train['Name'][0:20]) #First 20 names in sorted order

In [None]:
titanic_train['Name'].describe() #Describe only 1 column

In [None]:
# Convert the numeric Pclass column to a categorical
p_class_new = pd.Categorical(titanic_train['Pclass'])
p_class_new

In [None]:
p_class_new.dtype

In [None]:
# Positions of the rows holding the highest fare.
# Series.max() is used instead of the builtin max(): it is vectorised and skips
# NaN, whereas builtin max() walks the values one by one in Python and its
# result depends on where a NaN happens to sit in the column.
np.where(titanic_train["Fare"] == titanic_train["Fare"].max())

In [None]:
# Using iloc to retrieve the rows that satisfy the given condition.
# np.where returns a tuple of position arrays; element [0] holds the row
# positions, which is what iloc expects. Series.max() replaces the builtin
# max(), which is slow on a Series and not NaN-safe.
titanic_train.iloc[np.where(titanic_train["Fare"] == titanic_train["Fare"].max())[0]]

In [None]:
from numpy.random import randn as rn

In [None]:
#Random seed helps to give same random numbers every time
np.random.seed(101)
matrix_data = rn(5,4)
matrix_data

In [None]:
#to convert data into dataframes
matrix_data = rn(5,4)
row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']

df = pd.DataFrame(matrix_data,row_labels,column_headings)
df

In [None]:
# iloc (integer location) retrieves rows by their 0-based integer position
# loc retrieves rows by their row label
# position 3 and label 'D' address the same row of df, so both prints match
print(df.iloc[3],'\n')
print(df.loc['D'])

In [None]:
#Rows and column extraction together
#1st and second rows with 2nd column
df.iloc[[0,1],[1]]

In [None]:
#Above data in loc
df.loc[['A','B'],['X']]

In [None]:
#axis = 0 means rows and axis = 1 means columns
df.drop('X',axis = 1)

In [None]:
df #if you do not write inplace = True, the drop is not permanent

In [None]:
df.drop('X',axis = 1, inplace = True)

In [None]:
df

In [None]:
# Alternative to inplace = True: assign the result of drop() to another
# variable, which leaves df itself unchanged
variable = df.drop('W',axis = 1)

In [None]:
print(variable,'\n')
print(df)

In [None]:
# Sample measurements: one row per label in row_labels, one column per heading.
# Built as a plain 2-D ndarray because the np.matrix class is no longer
# recommended by NumPy; a regular array is all a DataFrame needs.
matrix_data = np.array([[22, 66, 140],
                        [42, 70, 148],
                        [30, 62, 125],
                        [35, 68, 160],
                        [25, 62, 152]])
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']

In [None]:
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
df

In [None]:
df['Height']<65

In [None]:
df[df['Height']<65]

In [None]:
df

In [None]:
#Remove indexes
df.reset_index(drop = True)

In [None]:
df['xyz'] = 'hello world My nameis Utsav'.split()

In [None]:
df

In [None]:
df.set_index('xyz')

In [None]:
#Note : zip pairs the elements up as tuples; wrapping it in list() turns them into a list of tuples

In [None]:
# Multi-indexing
# Index levels: the outer level is the group, the inner level the number in it;
# the two lists are paired element by element into (outer, inner) tuples
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = [(group, number) for group, number in zip(outside, inside)]

In [None]:
print(hier_index)

In [None]:
hier_index = pd.MultiIndex.from_tuples(hier_index)
print(hier_index)
print(type(hier_index))

In [None]:
df1 = pd.DataFrame(data=np.round(rn(6,3)), index= hier_index, columns= ['A','B','C'])
df1

In [None]:
# Retrieve data from a multi-indexed frame: select outer label 'G1' with loc,
# then take rows at positions 0 and 1 of the column at position 1 with iloc
df1.loc['G1'].iloc[[0,1],1]

In [None]:
# Small frame with missing values, indexed by state abbreviation
df = (
    pd.DataFrame({'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]})
    .assign(States="CA NV AZ".split())
    .set_index('States')
)
df

In [None]:
# dropna removes rows that contain null values
# thresh = 2 keeps only the rows that have at least 2 non-NA values
df.dropna(thresh = 2)

In [None]:
df

In [None]:
df.fillna('NA Filled')

In [None]:
df.fillna(value = df['A'].mean())

In [None]:
# Sales records: company, sales person and the amount sold
data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)
df = pd.DataFrame(data=data)
df

In [None]:
# Group the rows by company and average the numeric columns of each group.
# numeric_only=True is needed because the 'Person' column holds strings:
# from pandas 2.0 on, GroupBy.mean() no longer drops non-numeric columns
# silently and raises a TypeError instead.
group_operation = df.groupby('Company')
group_operation.mean(numeric_only=True)

In [None]:
df1=group_operation.describe()

In [None]:
group_operation.describe().iloc[[0,1],[1,2]] #extract only 1st 2 rows mean and std

In [None]:
df1.transpose()

In [None]:
# First frame for concatenation: columns A-D holding A0..D3 on row labels 0-3
df1 = pd.DataFrame(
    {col: [col + str(n) for n in range(4)] for col in 'ABCD'},
    index=list(range(4)),
)

In [None]:
# Second frame for concatenation: columns A-D holding A4..D7 on row labels 0-3
df2 = pd.DataFrame(
    {col: [col + str(n) for n in range(4, 8)] for col in 'ABCD'},
    index=list(range(4)),
)


In [None]:
# Third frame for concatenation: columns A-D holding A8..D11 on row labels 8-11
df3 = pd.DataFrame(
    {col: [col + str(n) for n in range(8, 12)] for col in 'ABCD'},
    index=list(range(8, 12)),
)

In [None]:
# Concatenation: axis=0 places the frames one below the other
# Note : the labels on the other axis (column names for axis=0, row names for
# axis=1) must be the same in every frame, otherwise NaN values are added
df_cat1 = pd.concat([df1,df2,df3], axis=0)
print(df_cat1)

In [None]:
# Two frames that share a 'key' column; the keys differ in the second row
# ('K8' on the left, 'K1' on the right)
left = pd.DataFrame(dict(
    key=['K0', 'K8', 'K2', 'K3'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))

right = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
print(left,'\n',right)

In [None]:
#Merge operation
merge = pd.merge(left,right,how = 'inner',on = 'key')
merge

In [None]:
merge = pd.merge(left,right,how = 'outer',on = 'key')
merge

In [None]:
# Left join: keeps every row of left and attaches only the matching rows of right
merge = pd.merge(left,right,how = 'left',on = 'key')
merge

### The merge operation joins on column keys, while the join operation joins on the index

In [None]:
# Two frames whose row labels (K0, K1, ...) act as the join keys
left = pd.DataFrame(
    dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']),
    index=['K0', 'K1', 'K2'],
)

right = pd.DataFrame(
    dict(C=['C0', 'C2', 'C3'], D=['D0', 'D2', 'D3']),
    index=['K0', 'K2', 'K3'],
)

In [None]:
print(left,'\n\n',right)

In [None]:
left.join(right)

In [None]:
# Use of apply: it calls an external function or a lambda function on the data

In [None]:
# Define a function
def testfunc(x):
    if (x> 500):
        return (10*np.log10(x))
    else:
        return (x/10)

In [None]:
# Demo frame: a running integer column, a column with repeated values and a
# text column
df = pd.DataFrame(dict(
    col1=list(range(1, 11)),
    col2=[444, 555, 666, 444, 333, 222, 666, 777, 666, 555],
    col3='aaa bb c dd eeee fff gg h iii j'.split(),
))
df

In [None]:
df['col10'] = df['col2'].apply(testfunc)
df

In [None]:
# apply also accepts a lambda: here np.log is called on every value of col2 and
# the result is stored in a new column
df['FuncApplied'] = df['col2'].apply(lambda x : np.log(x))
print(df)

In [None]:
#Sorting
df.sort_values('col2')