# Pandas

### Working with DataSet using Pandas
- Pandas is an open-source library
- for data analysis in Python.
- We will use it for reading and writing data between in-memory data structures and files: CSVs, text files, SQL databases, Excel sheets, etc.

- Reshaping, Slicing, Indexing, Merging and Joining Datasets

### Installation 
`pip install pandas`

## Let's Work on a Movie Dataset 


In [43]:
import pandas as pd
import numpy as np

In [80]:
# Build the raw material for a DataFrame (i.e. a table): one entry per
# column, each holding 5 random integer marks drawn from 1..99.

user_data = {
    f"Marks{section}": np.random.randint(1, 100, size=5)
    for section in "ABC"
}

In [81]:
np.random.randint(1,100,5)

array([80,  3, 78, 53, 32])

In [82]:
user_data

{'MarksA': array([61,  2, 68, 45, 23]),
 'MarksB': array([ 8, 43, 57, 64, 50]),
 'MarksC': array([53, 22, 31, 74, 42])}

In [83]:
# Turn the dict of arrays into a table: each key becomes a column
df = pd.DataFrame(data=user_data)
print(df)

   MarksA  MarksB  MarksC
0      61       8      53
1       2      43      22
2      68      57      31
3      45      64      74
4      23      50      42


In [84]:
# Same table again, this time storing every column as 32-bit floats
df = pd.DataFrame(data=user_data, dtype="float32")
print(df)

   MarksA  MarksB  MarksC
0    61.0     8.0    53.0
1     2.0    43.0    22.0
2    68.0    57.0    31.0
3    45.0    64.0    74.0
4    23.0    50.0    42.0


In [85]:
df.head()

Unnamed: 0,MarksA,MarksB,MarksC
0,61.0,8.0,53.0
1,2.0,43.0,22.0
2,68.0,57.0,31.0
3,45.0,64.0,74.0
4,23.0,50.0,42.0


In [86]:
print(df.columns)

Index(['MarksA', 'MarksB', 'MarksC'], dtype='object')


In [87]:
# Creating a csv
# Note: the row index is written too, as an unnamed first column. That is
# why reading the file back shows an extra 'Unnamed: 0' column.

df.to_csv('marks.csv')

In [88]:
# Reading csv
# Load the file written above back into a new DataFrame.

my_data=pd.read_csv('marks.csv')

In [89]:
my_data

Unnamed: 0.1,Unnamed: 0,MarksA,MarksB,MarksC
0,0,61.0,8.0,53.0
1,1,2.0,43.0,22.0
2,2,68.0,57.0,31.0
3,3,45.0,64.0,74.0
4,4,23.0,50.0,42.0


In [90]:
# Re-read the file, using its first column (the saved row index) as the
# DataFrame index. This avoids creating a stray 'Unnamed: 0' column and
# then dropping it by its auto-generated name.
my_data = pd.read_csv('marks.csv', index_col=0)

In [91]:
my_data

Unnamed: 0,MarksA,MarksB,MarksC
0,61.0,8.0,53.0
1,2.0,43.0,22.0
2,68.0,57.0,31.0
3,45.0,64.0,74.0
4,23.0,50.0,42.0


# Part 2

In [92]:
# Suppose we need to check the average score of the class

my_data.describe()

Unnamed: 0,MarksA,MarksB,MarksC
count,5.0,5.0,5.0
mean,39.8,44.4,44.4
std,27.307508,21.801376,20.231164
min,2.0,8.0,22.0
25%,23.0,43.0,31.0
50%,45.0,50.0,42.0
75%,61.0,57.0,53.0
max,68.0,64.0,74.0


In [93]:
# To see the first rows we use head()

my_data.head()

Unnamed: 0,MarksA,MarksB,MarksC
0,61.0,8.0,53.0
1,2.0,43.0,22.0
2,68.0,57.0,31.0
3,45.0,64.0,74.0
4,23.0,50.0,42.0


In [94]:
my_data.head(n=2)

Unnamed: 0,MarksA,MarksB,MarksC
0,61.0,8.0,53.0
1,2.0,43.0,22.0


In [95]:
# To see the last rows we use tail()

my_data.tail()

Unnamed: 0,MarksA,MarksB,MarksC
0,61.0,8.0,53.0
1,2.0,43.0,22.0
2,68.0,57.0,31.0
3,45.0,64.0,74.0
4,23.0,50.0,42.0


In [96]:
my_data.tail(n=2)

Unnamed: 0,MarksA,MarksB,MarksC
3,45.0,64.0,74.0
4,23.0,50.0,42.0


In [97]:
# Accessing a particular row

df.iloc[3]

MarksA    45.0
MarksB    64.0
MarksC    74.0
Name: 3, dtype: float32

In [98]:
# Accessing particular row and column

df.iloc[3,1]

64.0

In [99]:
# Chained form: select row 3, then take its second value by position.
# The row is a Series labelled by column name, so use .iloc for the second
# step too; a bare integer key in [] only works through a deprecated
# positional fallback.
df.iloc[3].iloc[1]

64.0

In [100]:
# Translate the column name into its integer position, then index by position
idx = df.columns.get_loc(key='MarksB')
df.iloc[3, idx]

64.0

In [101]:
# Resolve several column names to their integer positions in one pass
idx = [df.columns.get_loc(name) for name in ('MarksB', 'MarksC')]
print(idx)
df.iloc[3, idx]

[1, 2]


MarksB    64.0
MarksC    74.0
Name: 3, dtype: float32

In [102]:
# Printing the first 3 rows of the selected columns

df.iloc[:3,idx]

Unnamed: 0,MarksB,MarksC
0,8.0,53.0
1,43.0,22.0
2,57.0,31.0


In [103]:
df.iloc[:3,[1,2]]

Unnamed: 0,MarksB,MarksC
0,8.0,53.0
1,43.0,22.0
2,57.0,31.0


In [104]:
# Sorting your dataframe

my_data

Unnamed: 0,MarksA,MarksB,MarksC
0,61.0,8.0,53.0
1,2.0,43.0,22.0
2,68.0,57.0,31.0
3,45.0,64.0,74.0
4,23.0,50.0,42.0


In [105]:
my_data.sort_values(by=["MarksA"],ascending=True)

Unnamed: 0,MarksA,MarksB,MarksC
1,2.0,43.0,22.0
4,23.0,50.0,42.0
3,45.0,64.0,74.0
0,61.0,8.0,53.0
2,68.0,57.0,31.0


In [106]:
my_data.sort_values(by=["MarksA"],ascending=False)

Unnamed: 0,MarksA,MarksB,MarksC
2,68.0,57.0,31.0
0,61.0,8.0,53.0
3,45.0,64.0,74.0
4,23.0,50.0,42.0
1,2.0,43.0,22.0


In [107]:
my_data.sort_values(by=["MarksC","MarksA"],ascending=False)

Unnamed: 0,MarksA,MarksB,MarksC
3,45.0,64.0,74.0
0,61.0,8.0,53.0
4,23.0,50.0,42.0
2,68.0,57.0,31.0
1,2.0,43.0,22.0


In [108]:
# Extract the table's contents as a NumPy array. to_numpy() is the accessor
# the pandas documentation recommends over the older .values attribute.
data_array = my_data.to_numpy()

In [109]:
print(type(my_data))
print(my_data.shape)

<class 'pandas.core.frame.DataFrame'>
(5, 3)


In [110]:
 print(data_array)

[[61.  8. 53.]
 [ 2. 43. 22.]
 [68. 57. 31.]
 [45. 64. 74.]
 [23. 50. 42.]]


In [111]:
print(type(data_array))

<class 'numpy.ndarray'>


In [112]:
data_array.shape

(5, 3)

In [114]:
data_array[2][2]

31.0

In [116]:
# Convert the NumPy array back into a DataFrame, casting the values to
# int32 and giving the three columns subject names.

new_df = pd.DataFrame(
    data=data_array,
    columns=["Physics", "Chem", "Maths"],
    dtype="int32",
)

In [117]:
new_df

Unnamed: 0,Physics,Chem,Maths
0,61,8,53
1,2,43,22
2,68,57,31
3,45,64,74
4,23,50,42


In [118]:
# Save the Physics/Chem/Maths table to disk as PCM.csv
new_df.to_csv("PCM.csv")

In [119]:
 new_df.to_csv?