## getting started with pandas

In [47]:
# pip install pandas #done

In [3]:
import pandas as pd

In [4]:
mydataset = {
    'cars':["BMW","volvo","Ford"],
    'passings':[3,7,2]
}

myvar =  pd.DataFrame(mydataset)

print(myvar)

    cars  passings
0    BMW         3
1  volvo         7
2   Ford         2


## checking pandas version

In [5]:
print(pd.__version__)

1.5.3


### What is a Series?

##### A Pandas Series is like a column in a table. It is a one-dimensional array holding data of any type.


In [6]:
x = [1,7,2]

myvar = pd.Series(x)

print(myvar)

0    1
1    7
2    2
dtype: int64


### Create Labels
With the index argument, you can name your own labels.

In [7]:
# we can replace above 0 1 2 serials with own numbering example :
x =[5,6,7]

myvar = pd.Series(x,index = ["x","y","z"])

print(myvar)

x    5
y    6
z    7
dtype: int64


In [8]:
print(myvar["y"])

6


## Key/Value Objects as Series

#### Note: The keys of the dictionary become the labels.

In [9]:
calories = {"day1": 420, "day2": 380, "day3": 390}

In [11]:
myvar = pd.Series(calories)

print(myvar)

day1    420
day2    380
day3    390
dtype: int64


### if we add custom label of only day1 and day2 only data of day1 & day 2 will come and other remainig will vanish :

In [19]:
import pandas as pd

myvar = pd.Series(calories,index = ["day1","day2"])

print(myvar)

day1    420
day2    380
dtype: int64


In [12]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

In [13]:
myvar = pd.DataFrame(data)

print(myvar)

   calories  duration
0       420        50
1       380        40
2       390        45


## Locate Row
As you can see from the result above, the DataFrame is like a table with rows and columns.
Pandas use the loc attribute to return one or more specified row(s)

In [14]:
#refer to the row index:
print(myvar.loc[0])

calories    420
duration     50
Name: 0, dtype: int64


### here we located the row ZERO and python returned its values

In [15]:
#use a list of indexes:
print(myvar.loc[[0, 1]])

   calories  duration
0       420        50
1       380        40


In [16]:
# NOW lets add a custom index to data

myvar = pd.DataFrame(data,index=['day1','day2','day3'])

In [17]:
myvar

Unnamed: 0,calories,duration
day1,420,50
day2,380,40
day3,390,45


In [18]:
myvar.loc["day3"]

calories    390
duration     45
Name: day3, dtype: int64

# opening files in PANDAS

In [19]:
df = pd.read_csv("data.csv")

print(df) # df.to_string

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
..        ...    ...       ...       ...
164        60    105       140     290.8
165        60    110       145     300.0
166        60    115       145     310.2
167        75    120       150     320.4
168        75    125       150     330.4

[169 rows x 4 columns]


### Tip: use to_string() to print the entire DataFrame.

# max_rows

pd.options.display.max_rows    

You can check your system's maximum rows with the pd.options.display.max_rows statement.


In [20]:
pd.options.display.max_rows

60

number is 60, which means that if the DataFrame contains more than 60 rows, the print(df) statement will return only the headers and the first and last 5 rows.

#### Increase the maximum number of rows to display the entire DataFrame:

In [22]:
pd.options.display.max_rows = 9

df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
...,...,...,...,...
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


# Read JSON

In [24]:
import pandas as pd

df = pd.read_json("data.json")

In [25]:
df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
...,...,...,...,...
165,60,110,145,300.4
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


In [27]:
data = {
  "Duration":{
    "0":60,
    "1":60,
    "2":60,
    "3":45,
    "4":45,
    "5":60
  },
  "Pulse":{
    "0":110,
    "1":117,
    "2":103,
    "3":109,
    "4":117,
    "5":102
  },
  "Maxpulse":{
    "0":130,
    "1":145,
    "2":135,
    "3":175,
    "4":148,
    "5":127
  },
  "Calories":{
    "0":409,
    "1":479,
    "2":340,
    "3":282,
    "4":406,
    "5":300
  }
}

In [28]:
df = pd.DataFrame(data)

df

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409
1,60,117,145,479
2,60,103,135,340
3,45,109,175,282
4,45,117,148,406
5,60,102,127,300


# Pandas - Analyzing DataFrames

Viewing the Data : use head() , this fn returns th header and a specified no of rows ,starting from top.

In [34]:
df = pd.read_csv('data.csv')

print(df.head(10))

    Duration  Pulse  Maxpulse  Calories
0         60    110       130     409.1
1         60    117       145     479.0
2         60    103       135     340.0
3         45    109       175     282.4
..       ...    ...       ...       ...
6         60    110       136     374.0
7         45    104       134     253.3
8         30    109       133     195.1
9         60     98       124     269.0

[10 rows x 4 columns]


In [35]:
df = pd.read_json('data.json')

print(df.head())

   Duration  Pulse  Maxpulse  Calories
0        60    110       130     409.1
1        60    117       145     479.0
2        60    103       135     340.0
3        45    109       175     282.4
4        45    117       148     406.0


### There is also a tail() method for viewing the last rows of the DataFrame. The tail() method returns the headers and a specified number of rows, starting from the bottom.

In [36]:
df.tail()

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
164,60,105,140,290.8
165,60,110,145,300.4
166,60,115,145,310.2
167,75,120,150,320.4
168,75,125,150,330.4


### Info About the Data
The DataFrames object has a method called info(), that gives you more information about the data set.

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 6.6 KB


# pandas basic re-practise

In [None]:
import pandas as pd

# 1. From Python Dictionary

In [39]:
my_dict = {
    "Name": ['Hari', 'Ram', 'Shyam', 'Asmita', 'Alisha'],
    "Age": [24, 23, 32, 21, 20],
    "Gender": ['Male', 'Male', 'Male', 'Female', 'Female']
}

In [41]:
df = pd.DataFrame(my_dict)

In [46]:
print(df)

     Name  Age  Gender
0    Hari   24    Male
1     Ram   23    Male
2   Shyam   32    Male
3  Asmita   21  Female
4  Alisha   20  Female


# 2. From a list of dictionaries

In [48]:
list_of_dict = [{'Name': 'Hari', 'Age': 24, 'Gender': 'Male'},
                  {'Name': 'Ram', 'Age': 23, 'Gender': 'Male'},
                  {'Name': 'Shyam', 'Age': 32, 'Gender': 'Male'},
                  {'Name': 'Asmita', 'Age': 21, 'Gender': 'Female'},
                  {'Name': 'Alisha', 'Age': 20, 'Gender': 'Female'}]

In [49]:
df = pd.DataFrame(list_of_dict)

In [50]:
df

Unnamed: 0,Name,Age,Gender
0,Hari,24,Male
1,Ram,23,Male
2,Shyam,32,Male
3,Asmita,21,Female
4,Alisha,20,Female


# From a list of tuples

In [61]:
list_of_tuples = [('Roshan', 24,'Male'),
                  ('Ram', 23, 'Male'),
                  ('Shyam', 32, 'Male'),
                  ('Asmita', 21, 'Female'),
                  ('Alisha', 20, 'Female')]

In [62]:
df = pd.DataFrame(list_of_tuples,columns = ['Name','Age','Gender'])

In [63]:
df

Unnamed: 0,Name,Age,Gender
0,Roshan,24,Male
1,Ram,23,Male
2,Shyam,32,Male
3,Asmita,21,Female
4,Alisha,20,Female


# From list of lists

In [64]:
list_of_lists = [
    ['Roshan', 24,'Male'],
    ['Ram', 23, 'Male'],
    ['Shyam', 32, 'Male'],
    ['Asmita', 21, 'Female'],
    ['Alisha', 20, 'Female']
]

In [65]:
df = pd.DataFrame(list_of_lists,columns=['Naam','Umer','Linga'])

In [66]:
df

Unnamed: 0,Naam,Umer,Linga
0,Roshan,24,Male
1,Ram,23,Male
2,Shyam,32,Male
3,Asmita,21,Female
4,Alisha,20,Female


In [67]:
cf = pd.DataFrame(list_of_lists)

In [68]:
cf

Unnamed: 0,0,1,2
0,Roshan,24,Male
1,Ram,23,Male
2,Shyam,32,Male
3,Asmita,21,Female
4,Alisha,20,Female


### yedi hamle aafai columns diyenam bhane , 1st row /heading wala row aaudaina

# Question:
Read 'weather_data.csv' file using csv reader.<br>
Store the data inside the csv file into a list of lists.<br>
Then create a pandas dataframe using list of list.


In [76]:
import os

In [73]:
from csv import reader

In [80]:
file_path = os.path.join(os.getcwd(),"csv_data","weather_data.csv")

In [81]:
csv_file = open(file_path)

In [82]:
file_read = reader(csv_file)

In [83]:
data = list(file_read)

In [84]:
data

[['kfjkdfjskd'],
 ['dfuhsdjufio'],
 ['day', 'temperature', 'windspeed', 'event'],
 ['1/1/2017', '32', '6', 'Rain'],
 ['1/4/2017', 'not available', '9', 'Sunny'],
 ['1/5/2017', '-1', 'not measured', 'Snow'],
 ['1/6/2017', 'not available', '7', 'no event'],
 ['1/7/2017', '32', 'not measured', 'Rain'],
 ['1/8/2017', 'not available', 'not measured', 'Sunny'],
 ['1/9/2017', 'not available', 'not measured', 'no event'],
 ['1/10/2017', '34', '8', 'Cloudy'],
 ['1/11/2017', '-4', '-1', 'Snow'],
 ['1/12/2017', '26', '12', 'Sunny'],
 ['1/13/2017', '12', '12', 'Rainy'],
 ['1/11/2017', '-1', '12', 'Snow'],
 ['1/14/2017', '40', '-1', 'Sunny']]

In [86]:
newlist = data[2:]

In [87]:
newlist

[['day', 'temperature', 'windspeed', 'event'],
 ['1/1/2017', '32', '6', 'Rain'],
 ['1/4/2017', 'not available', '9', 'Sunny'],
 ['1/5/2017', '-1', 'not measured', 'Snow'],
 ['1/6/2017', 'not available', '7', 'no event'],
 ['1/7/2017', '32', 'not measured', 'Rain'],
 ['1/8/2017', 'not available', 'not measured', 'Sunny'],
 ['1/9/2017', 'not available', 'not measured', 'no event'],
 ['1/10/2017', '34', '8', 'Cloudy'],
 ['1/11/2017', '-4', '-1', 'Snow'],
 ['1/12/2017', '26', '12', 'Sunny'],
 ['1/13/2017', '12', '12', 'Rainy'],
 ['1/11/2017', '-1', '12', 'Snow'],
 ['1/14/2017', '40', '-1', 'Sunny']]

In [94]:
# df = pd.DataFrame(newlist)

In [95]:
# df

In [96]:
columns = newlist[:1]

In [97]:
columns

[['day', 'temperature', 'windspeed', 'event']]

In [102]:
df = pd.DataFrame(newlist[2:], columns = columns)

In [105]:
df.head(3)

Unnamed: 0,day,temperature,windspeed,event
0,1/4/2017,not available,9,Sunny
1,1/5/2017,-1,not measured,Snow
2,1/6/2017,not available,7,no event


In [107]:
df.tail(3)

Unnamed: 0,day,temperature,windspeed,event
9,1/13/2017,12,12,Rainy
10,1/11/2017,-1,12,Snow
11,1/14/2017,40,-1,Sunny


# 5. Pandas Dataframe From Csv files

### now lets read the weather_data file using csv files in pandas

In [7]:
file_path = os.path.join(os.getcwd(),"csv_data","weather_data.csv")

In [8]:
file_path

'C:\\Users\\Roshan KC\\Downloads\\6months_datascience_challange\\csv_data\\weather_data.csv'

In [9]:
import pandas as pd
import os

In [15]:
file = pd.read_csv(file_path,skiprows=2)

ValueError: header must be integer or list of integers

In [13]:
file

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/4/2017,not available,9,Sunny
2,1/5/2017,-1,not measured,Snow
3,1/6/2017,not available,7,no event
4,1/7/2017,32,not measured,Rain
5,1/8/2017,not available,not measured,Sunny
6,1/9/2017,not available,not measured,no event
7,1/10/2017,34,8,Cloudy
8,1/11/2017,-4,-1,Snow
9,1/12/2017,26,12,Sunny


one more example :

lets read a car details csv file from csv_data folder

In [121]:
file_path = os.path.join(os.getcwd(),"csv_data","car_details.csv")

In [122]:
df = pd.read_csv(file_path)

In [123]:
df

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
...,...,...,...,...,...,...,...,...
4336,Hyundai i20 Magna 1.4 CRDi,2014,409999,80000,Diesel,Individual,Manual,Second Owner
4337,Maruti 800 AC BSIII,2009,110000,83000,Petrol,Individual,Manual,Second Owner
4338,Hyundai Creta 1.6 CRDi SX Option,2016,865000,90000,Diesel,Individual,Manual,First Owner
4339,Renault KWID RXT,2016,225000,40000,Petrol,Individual,Manual,First Owner


### lets again read weather_csv using read_csv

In [124]:
file_path = os.path.join(os.getcwd(),"csv_data","weather_data.csv")

In [126]:
df = pd.read_csv(file_path,skiprows=2)

In [127]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/4/2017,not available,9,Sunny
2,1/5/2017,-1,not measured,Snow
3,1/6/2017,not available,7,no event
...,...,...,...,...
9,1/12/2017,26,12,Sunny
10,1/13/2017,12,12,Rainy
11,1/11/2017,-1,12,Snow
12,1/14/2017,40,-1,Sunny


In [128]:
df = pd.read_csv(file_path,header=2)

In [129]:
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32,6,Rain
1,1/4/2017,not available,9,Sunny
2,1/5/2017,-1,not measured,Snow
3,1/6/2017,not available,7,no event
...,...,...,...,...
9,1/12/2017,26,12,Sunny
10,1/13/2017,12,12,Rainy
11,1/11/2017,-1,12,Snow
12,1/14/2017,40,-1,Sunny
