In [1]:
import numpy as np
import pandas as pd


# **Creating DataFrame**

In [2]:
# Build a DataFrame from a list of rows: every inner list is one student,
# and `columns` supplies the names for the three positions in each row.
student_data = [
    [100, 80, 10],
    [90, 70, 7],
    [120, 100, 14],
    [80, 50, 2],
]

pd.DataFrame(data=student_data, columns=['iq', 'marks', 'package'])

Unnamed: 0,iq,marks,package
0,100,80,10
1,90,70,7
2,120,100,14
3,80,50,2


In [3]:
# Build a DataFrame from a dict: each key becomes a column name and each
# list holds that column's values, one entry per student.
student_dict = dict(
    name=['shivam', 'shubham', 'shivangi', 'shubhangi', 'manish', 'manisha'],
    iq=[100, 90, 120, 80, 0, 0],
    marks=[80, 70, 100, 50, 0, 0],
    package=[10, 7, 14, 2, 0, 0],
)

students = pd.DataFrame(data=student_dict)
students



Unnamed: 0,name,iq,marks,package
0,shivam,100,80,10
1,shubham,90,70,7
2,shivangi,120,100,14
3,shubhangi,80,50,2
4,manish,0,0,0
5,manisha,0,0,0


In [4]:
# set index without modifying the actual dataframe:
# set_index returns a new frame indexed by 'name'; `students` itself keeps
# its default integer index because the result is not assigned back.

students.set_index('name')


Unnamed: 0_level_0,iq,marks,package
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
shivam,100,80,10
shubham,90,70,7
shivangi,120,100,14
shubhangi,80,50,2
manish,0,0,0
manisha,0,0,0


In [5]:
# set index inplace: mutates `students` itself, so 'name' stops being a column.
# NOTE(review): this cell is not re-runnable as-is; a second run raises KeyError
# because 'name' is no longer among the columns.
students.set_index('name', inplace=True)
students

Unnamed: 0_level_0,iq,marks,package
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
shivam,100,80,10
shubham,90,70,7
shivangi,120,100,14
shubhangi,80,50,2
manish,0,0,0
manisha,0,0,0


In [6]:
# to reverse the inplace index set:
# reset_index moves 'name' back into a regular column and restores the
# default integer index, again mutating `students` in place.

students.reset_index(inplace=True)


In [7]:
# using read_csv
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# 'movies.csv' must be present in the notebook's working directory before this
# cell (and every later cell that uses `movies`) can run.
movies = pd.read_csv('movies.csv')
movies


FileNotFoundError: [Errno 2] No such file or directory: 'movies.csv'

In [8]:
# Load the IPL match data.
# NOTE(review): the recorded run of this cell failed with FileNotFoundError, so
# 'ipl-matches.csv' must be present in the notebook's working directory before
# this cell (and every later cell that uses `ipl`) can run.
ipl = pd.read_csv('ipl-matches.csv')
ipl

FileNotFoundError: [Errno 2] No such file or directory: 'ipl-matches.csv'

# **DataFrame Attributes and Methods**

In [None]:
# shape: a (number of rows, number of columns) tuple

movies.shape

In [None]:
ipl.shape

In [None]:
# dtypes: the data type of every column, returned as a Series keyed by column name
movies.dtypes

In [9]:
ipl.dtypes

NameError: name 'ipl' is not defined

In [10]:
# index: the row labels (read_csv was called without index_col, so this is
# the default integer range)
movies.index

NameError: name 'movies' is not defined

In [None]:
# columns: the column labels of the frame
movies.columns

In [None]:
ipl.columns

In [None]:
# values: the frame's data as a 2-D NumPy array (index and column labels are dropped)
students.values

In [None]:
ipl.values

In [None]:
# head and tail: preview rows from the top / bottom of the frame
# (5 rows when no count is given; pass n to change it)
movies.head()

In [None]:
movies.head(7)

In [None]:
ipl.tail()

In [None]:
ipl.tail(8)

In [None]:
# sample: 8 randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run
ipl.sample(8)

In [None]:
# info: prints a summary of the frame, including column names, non-null counts,
# dtypes and memory usage
movies.info()

In [None]:
# describe: summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns
movies.describe()

In [None]:
# isnull: True/False per cell for missing values; summing the booleans
# gives the number of missing values in each column
movies.isnull().sum()

In [None]:
# duplicated: flags each row that repeats an earlier row exactly;
# summing the flags counts the duplicate rows
movies.duplicated().sum()

In [None]:
#rename
students

In [None]:
# rename takes an {old name: new name} mapping and returns a new frame;
# `students` keeps its original column names because the result is not assigned.
students.rename(columns={'marks':'percent','package':'lpa'})


# **Math Methods**

In [None]:
# sum -> the axis argument picks the direction of the reduction;
# axis=0 adds down each column, giving one total per column.
# numeric_only=True skips the string 'name' column: the index was reset
# earlier, so 'name' is an ordinary column here and would otherwise be
# "summed" by concatenating every name into one long string.
students.sum(axis=0, numeric_only=True)

In [None]:
students

In [None]:
# Index the students by name so the row-wise statistics that follow only see
# the numeric columns. The guard keeps the cell re-runnable: once 'name' is
# the index it is no longer a column, and a second set_index would raise KeyError.
if 'name' in students.columns:
    students = students.set_index('name')

In [None]:
# mean: axis=1 averages across the columns of each row, giving one value per student
students.mean(axis=1)

In [None]:
# var: with no axis given, the variance is computed down each column

students.var()

# **Selecting columns from a DataFrame**

In [None]:
# single column: indexing with one column name returns a Series
movies['actors']


In [None]:
type(movies['actors'])

In [None]:
# multiple columns: indexing with a list of names returns a DataFrame
# whose columns appear in the order listed
movies[['year_of_release','actors','title_x']]

In [None]:
ipl[['Team1','Team2','WinningTeam']]

# **Selecting Rows from a DataFrame**

*   **iloc -** selects rows by integer position (0-based)
*   **loc -** selects rows by index label



In [None]:
# single row: position 4 is the fifth row (positions start at 0); returned as a Series
movies.iloc[4]

In [None]:
movies

In [None]:
# multiple rows: a positional slice; the end is excluded, so this is positions 0-4
movies.iloc[:5]

In [None]:
# fancy indexing: pass a list of positions to pick arbitrary,
# non-contiguous rows (here positions 0, 4 and 5)

movies.iloc[[0,4,5]]

In [None]:
# loc

students


In [None]:
# One row by its index label, returned as a Series; this relies on 'name'
# having been set as the index of `students`.
students.loc['shivam']

In [None]:
# Label slice with a step: unlike positional slices, both end labels are
# included, and the step of 2 keeps every second row in that range.
students.loc['shivam':'manisha': 2]

In [None]:
# A list of labels picks exactly those rows, in the order listed.
students.loc[['shivam', 'shivangi','shubhangi']]

# **Selecting both rows and columns**

In [None]:
# iloc[rows, cols] by position: the first three rows and first three columns
# (slice ends are excluded)
movies.iloc[0:3,0:3]

In [None]:
# Two integer positions return a single cell: fourth row, fourth column.
movies.iloc[3,3]

In [None]:
# loc[rows, cols] by label: both slices include their end labels, so this is
# rows labelled 0, 1 and 2 and every column from 'title_x' through 'poster_path'.
movies.loc[0:2,'title_x':'poster_path']

# **Filtering a DataFrame**

In [11]:
ipl.head()

NameError: name 'ipl' is not defined

In [None]:
ipl.head(2)

In [12]:
# Find the winner of every final.
# Step 1: boolean Series that is True on rows whose MatchNumber is 'Final'.
mask = ipl['MatchNumber'].eq('Final')
# Step 2: keep only those rows.
new_df = ipl.loc[mask]
# Step 3: show just the season and the team that won.
new_df.loc[:, ['Season', 'WinningTeam']]

NameError: name 'ipl' is not defined

In [None]:
# The same query as a single expression: .loc takes the row filter and the
# column list together, so no intermediate frame is needed.
ipl.loc[ipl['MatchNumber'] == 'Final', ['Season', 'WinningTeam']]

In [None]:
# How many super over finishes have occurred?
# Peek at a few random rows first to see which column carries that information.
ipl.sample(3)

In [13]:
# Number of matches whose SuperOver flag is 'Y'.
len(ipl[ipl['SuperOver'] == 'Y'])

NameError: name 'ipl' is not defined

In [None]:
# How many matches has csk won in kolkata
ipl.head(2)

In [None]:
# Matches played in Kolkata that Chennai Super Kings won: both conditions
# must hold on the same row, hence the element-wise `&`.
df = ipl.loc[ipl['City'].eq('Kolkata') & ipl['WinningTeam'].eq('Chennai Super Kings')]

In [14]:
# Number of rows left after the filter.
len(df)

NameError: name 'df' is not defined

In [None]:
# Percentage of matches in which the toss winner also won the match:
# rows where the two columns agree, divided by all rows in `ipl`, times 100.
len(ipl[ipl['TossWinner'] == ipl['WinningTeam']]) / len(ipl) * 100

In [15]:
# movies with rating higher than 8 and votes>10000
movies.head(3)

NameError: name 'movies' is not defined

In [None]:
# Count the movies rated above 8 that also received more than 10000 votes.
len(movies[movies['imdb_rating'].gt(8) & movies['imdb_votes'].gt(10000)])

In [None]:
# Action movies with rating higher than 7.5
movies.sample(5)

In [16]:
# True for movies that list 'Action' as one of their '|'-separated genres.
# NOTE(review): a missing genres value would make the `in` test raise
# TypeError; confirm the column has no NaN.
mask1 = movies['genres'].str.split('|').map(lambda genre_list: 'Action' in genre_list)

NameError: name 'movies' is not defined

In [None]:
mask2 = movies['imdb_rating'] > 7.5

In [17]:
# Number of movies that satisfy both masks.
len(movies[mask1 & mask2])

NameError: name 'movies' is not defined

In [None]:
# another way: search the raw genres string instead of splitting it.
# na=False makes a row with a missing genres value count as "not Action";
# without it the mask would carry NaN and could not be used for boolean indexing.
# regex=False treats 'Action' as literal text rather than a pattern.
# NOTE(review): this is a substring match, so it also matches any genre whose
# name merely contains 'Action'; confirm that is acceptable for this data.
mask3 = movies['genres'].str.contains('Action', na=False, regex=False)
movies[mask2 & mask3].shape[0]

In [None]:
# Write a function that can return the track record of 2 teams against each other


# **Adding new columns**

In [18]:
# completely new column: assigning a single value to a new column name
# broadcasts it to every row. This modifies `movies` in place.
movies['country'] = 'India'
movies.head()

NameError: name 'movies' is not defined

# **Important Functions**

In [None]:
# astype
ipl.info()

In [None]:
# astype returns a converted copy, so it is assigned back to the column.
# NOTE(review): int32 is presumably chosen to reduce memory; confirm every ID fits in 32 bits.
ipl['ID'] = ipl['ID'].astype('int32')

In [19]:
ipl.info()

NameError: name 'ipl' is not defined

In [None]:
# Convert Season to the pandas 'category' dtype, which stores each distinct
# value once plus a code per row.
ipl['Season'] = ipl['Season'].astype('category')

In [20]:
ipl.info()

NameError: name 'ipl' is not defined