In [None]:
import numpy as np
import pandas as pd

## Creating a DataFrame




*These are the different ways in which one can create a DataFrame*

In [None]:
# Using a Dict with an Index
# Dict keys become the column labels; `index` supplies the row labels.
index = ['Ram', 'Sham', 'Bill']
data = {
    "a": [4, 5, 6],
    "b": [7, 8, 9],
    "c": [10, 11, 12],
}

df = pd.DataFrame(data=data, index=index)
df

In [None]:
# Using a List of Lists
# Each inner list is one ROW (not one column), so the values are laid out
# row-wise here to build the same frame as the dict-based examples.
data = [[4, 7, 10], [5, 8, 11], [6, 9, 12]]
col = ['a', 'b', 'c']
index = ['Ram', 'Sham', 'Bill']

df = pd.DataFrame(data, index=index, columns=col)
df

In [None]:
# Using a Dict without an Index
# With no index given, pandas labels the rows 0, 1, 2, ...
data = dict(a=[4, 5, 6], b=[7, 8, 9], c=[10, 11, 12])
df = pd.DataFrame(data=data)
df

In [None]:
# Using List of Dicts
# Every dict describes one row; its keys name the columns.
data = [
    dict(a=4, b=7, c=10),
    dict(a=5, b=8, c=11),
    dict(a=6, b=9, c=12),
]
df = pd.DataFrame(data=data)
df

## Reset Index/Drop Index Row

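*To reset the index back to the default integer index (the old index is kept as a new column unless `drop=True` is passed)*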

In [None]:
# Frame as it stands before the index is reset
df

In [None]:
# inplace=False (the default) returns a new frame and leaves df untouched
df.reset_index(inplace = False)

In [None]:
df # Because inplace is False, the reset was applied to a returned copy; df itself is unchanged

In [None]:
# inplace=True modifies df itself and returns None, so this cell displays nothing
df.reset_index(inplace = True)

In [None]:
# The previous index now shows up as an extra 'index' column
df

## df.loc and df.iloc

**loc** *is label-based: use it with the actual index and column labels (e.g. `'Ram'`, `'a'`).*

**iloc** *is position-based: use it with 0-based integer positions, regardless of what the labels are.*

Remember that both `loc` and `iloc` select rows first and (optionally) columns second: `loc` by label, `iloc` by integer position.

In [None]:
# Creating a DataFrame
# Three labelled rows ('Ram', 'Sham', 'Bill') and three columns ('a', 'b', 'c').

index = ['Ram', 'Sham', 'Bill']
data = {
    "a": [4, 5, 6],
    "b": [7, 8, 9],
    "c": [10, 11, 12],
}

df = pd.DataFrame(index=index, data=data)
df

In [None]:
df.loc['Ram'] # Label-based row lookup; 'Ram' is the first row, so df.iloc[0] returns the same row
# Positional equivalent: df.iloc[0]

In [None]:
# We want all the rows where 'a' is greater than 5

df.loc[(df['a'] > 5)] # Same as df[df['a'] > 5] 
# df.iloc[(df['a'] > 5)] raises an error: iloc indexing cannot use an indexable (a boolean Series) as a mask

In [None]:
# Keep only column 'c', restricted to the rows whose 'b' value is at least 8.
# Row mask and column label go into a single .loc call.

df.loc[df['b'].ge(8), 'c']  # Same result as df.loc[df['b'] >= 8]['c']

In [None]:
# .loc can also fetch a single element when given both the row and the column label.
# Passing both labels in one call avoids chained indexing (df.loc['Ram']['a']),
# which performs two separate lookups and is unreliable when used for assignment.

df.loc['Ram', 'a']

In [None]:
# Select a subset of columns for every row (':' means "all rows")

df.loc[:,['a','b']] # Same as df.loc[:][['a','b']], or simply df[['a','b']]

## Groupby 

*Used to split the data into groups based on some criteria*

In [None]:
# NOTE(review): assumes nba.csv sits in the working directory and contains the
# "Team", "Position" and "Age" columns used by the cells below — confirm.
df = pd.read_csv("nba.csv") # Load the dataset into a DataFrame
df 

In [None]:
# Create a group based on the team

obj1 = df.groupby("Team") 

# first() returns, for every group, the first non-null value of each column.

obj1.first() 

In [None]:
# Now say we want to group by both the Team and the Position

obj2 = df.groupby(["Team", "Position"]) 

# first() returns, for every (Team, Position) group, the first non-null value of each column.

obj2.first() 

In [None]:
# Say we want to know the average age of every team:
# pick the 'Age' column from the Team grouping (obj1) and average within each group

obj1['Age'].mean()

## Adding a new row or column to the dataframe

In [None]:
# Creating a DataFrame
# Fresh 3x3 frame with named rows, used by the add/drop examples.

data = dict(a=[4, 5, 6], b=[7, 8, 9], c=[10, 11, 12])
index = ['Ram', 'Sham', 'Bill']

df = pd.DataFrame(data=data, index=index)
df

In [None]:
# Adding a new row: assigning to a row label that does not exist yet enlarges the frame

df.loc['New Name'] = [1,2,3]

In [None]:
# 'New Name' now appears as the last row
df

In [None]:
# Adding a new column
# A seeded generator keeps the "random" column identical on every Restart & Run All.

rng = np.random.default_rng(42)
df['d'] = rng.integers(1, 6, size=len(df))  # one integer from 1 to 5 per row

# Equivalent: df['d'] = pd.Series(rng.integers(1, 6, size=len(df)), index=df.index)


In [None]:
# The frame now carries the extra column 'd'
df

## Dropping a Row/Column from a DataFrame

In [None]:
# Starting point for the drop examples
df

In [None]:
df.drop("New Name", axis = 0) # Dropping a row from the DataFrame (axis=0 means row labels)

# Since inplace was not set to True, drop() returned a new frame; the row is still present in df.

In [None]:
df.drop("d", axis = 1) # Dropping a column from the DataFrame (axis=1 means column labels)

# Since inplace was not set to True, drop() returned a new frame; the column is still present in df.

In [None]:
# Permanently remove the "New Name" row and the 'd' column.
# `index=` targets row labels and `columns=` targets column labels, so a single
# in-place call covers both removals.

df.drop(index="New Name", columns="d", inplace=True)

In [None]:
# Both the 'New Name' row and the 'd' column are gone
df

In [None]:
# Dropping several columns at once (returns a new frame; df is not modified)

df.drop(columns=['a', 'b'])  # Same as df.drop(['a', 'b'], axis=1)

In [None]:
# Dropping several rows at once: look up the labels at positions 0 and 1,
# then drop those row labels (returns a new frame; df is not modified)

df.drop(index=df.index[[0, 1]])

In [None]:
# Dropping rows that satisfy a condition: collect the labels of the rows where
# 'a' is greater than 5, then drop exactly those labels

df.drop(index=df.loc[df['a'] > 5].index)

## Some other useful Pandas Commands

### Replacing Value(s) in a Column of the DataFrame

In [None]:
# Frame before any replacement
df

In [None]:
df.replace(4,'e') # Replaces every 4 anywhere in the dataframe with the letter 'e'; returns a new frame, df is unchanged

### value_counts()

*Used to return the counts of unique values in a given series* 

In [None]:
# Frame whose column 'a' is counted below
df

In [None]:
# Count how many times each distinct value occurs in column 'a'
df['a'].value_counts()

### notna()

*Returns True if the values are not null*

In [None]:
# Boolean frame of the same shape: True wherever a value is present (not missing)
df.notna()

### notnull()

*Alias of notna(): both return exactly the same result*

In [None]:
# notnull() is an alias of notna(), so the output is identical
df.notnull()

### append()

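*Used to append a new row to a given dataframe. Note: `DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0; on current versions use `pd.concat([df1, df2])` to get the same result.*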


*It also can append a dataframe to the given dataframe*

In [None]:
# First frame: four rows, columns 'a' and 'b'
df1 = pd.DataFrame(
    data=dict(a=[1, 2, 3, 4], b=[5, 6, 7, 8]),
)
df1

In [None]:
# Second frame, again built from a dictionary: three rows, same columns as df1
df2 = pd.DataFrame(
    data=dict(a=[1, 2, 3], b=[5, 6, 7]),
)
df2

In [None]:
# To append df2 at the end of df1.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and returns the same stacked frame.
pd.concat([df1, df2])

# Notice that the index values are not continuous: each frame keeps its own row labels

In [None]:
# ignore_index=True discards the original row labels, so the combined frame
# gets one continuous 0..n-1 index.
# (pd.concat replaces DataFrame.append, which was removed in pandas 2.0.)
pd.concat([df1, df2], ignore_index=True)

In [None]:
# Appending a DataFrame with a different shape:
# df3 has an extra column 'c' that the other frame does not have

df3 = pd.DataFrame(
    data=dict(a=[1, 2, 3], b=[5, 6, 7], c=[1, 5, 4]),
)
df3  # The new DataFrame we want to append

In [None]:
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([df1, df3])
# Column 'c' does not exist in df1, so its cells for the df1 rows are filled with NaN

### dtypes

*To identify the data type of each column of the dataframe*

In [None]:
# Frame whose column dtypes are inspected below
df

In [None]:
# One dtype entry per column
df.dtypes

### astype()

*Casting a particular pandas object to a specific dtype*

In [None]:
df = df.astype('float64') # astype has no inplace parameter; it returns a new frame, so df is rebound
df.dtypes

In [None]:
# To convert only column 'a' back to int, pass a {column: dtype} mapping

df = df.astype({'a':'int64'})
df.dtypes

### apply()

*The apply() function is used to apply a function along an axis of the DataFrame.*

In [None]:
# Frame before apply() is used on it
df

In [None]:
# Updating the values in the dataframe by replacing them with the square of the number.
# apply() hands each column (a Series) to the lambda, which squares it element-wise.
# Note: df is rebound here, so re-running this cell squares the values again.
df = df.apply(lambda x:x**2) 
df

In [None]:
df.apply(np.sum, axis=0) # Sum of all elements in every column (axis=0 gives one result per column)

In [None]:
df.apply(np.sum, axis=1) # Sum of all elements in every row (axis=1 gives one result per row)

### pd.to_numeric()

*It converts argument to a numeric type.*

Parameters :

* **errors{‘ignore’, ‘raise’, ‘coerce’}, default ‘raise’**
    *  If **‘raise’**, then invalid parsing will raise an exception.

    *  If **‘coerce’**, then invalid parsing will be set as NaN.

    *  If **‘ignore’**, then invalid parsing will return the input. (Deprecated since pandas 2.2: catch the exception explicitly instead.)
    
    
* **downcast{‘integer’, ‘signed’, ‘unsigned’, ‘float’}, default None**
     * If not **None**, and if the data has been successfully cast to a numerical dtype (or if the data was numeric to begin with), downcast that resulting data to the smallest numerical dtype possible according to the following rules:
         * **‘integer’ or ‘signed’**: smallest signed int dtype (min.: np.int8)
         * **‘unsigned’**: smallest unsigned int dtype (min.: np.uint8)
         * **‘float’**: smallest float dtype (min.: np.float32)

In [None]:
# Mixed input: two numeric strings and one int; to_numeric parses them into a float64 Series
s = pd.Series(['1.0', '2', -3])
pd.to_numeric(s)

In [None]:
# Downcast the parsed values to the smallest float dtype (float32)
pd.to_numeric(s, downcast='float')

In [None]:
# All parsed values are whole numbers, so they fit the smallest signed integer dtype (int8)
pd.to_numeric(s, downcast='signed')

In [None]:
s = pd.Series(['apple', '1.0', '2', -3])

# errors='ignore' (return the input untouched when parsing fails) is deprecated
# since pandas 2.2. The documented replacement is to catch the parsing
# exception explicitly and fall back to the original Series.
try:
    parsed = pd.to_numeric(s)
except (ValueError, TypeError):
    parsed = s  # 'apple' cannot be parsed, so the input is returned as-is
parsed

In [None]:
# errors='raise' (the default) raises a ValueError on the unparseable 'apple'.
# The exception is caught and displayed so the notebook still runs top to bottom.
try:
    pd.to_numeric(s, errors='raise')
except ValueError as exc:
    print(f"ValueError: {exc}")

In [None]:
pd.to_numeric(s, errors='coerce') # Entries that cannot be parsed ('apple') are replaced with NaN

### str.contains()

*Tests whether each string in a Series contains a given pattern; returns a boolean Series that can be used to filter the dataframe*

In [None]:
# Small frame with a numeric column 'A' and a string column 'C' to search in
df = pd.DataFrame(
    {
        "A": [5, 3, 5, 6],
        "C": ["foo", "bar", "fooXYZbar", "bat"],
    }
)
df

In [None]:
# Boolean Series: True where the value in 'C' contains "foo" (the pattern is treated as a regex by default)
df['C'].str.contains("foo")

In [None]:
# Use the boolean mask to keep only the rows whose 'C' value contains "foo"
df[df['C'].str.contains("foo")]