In [1]:
import pandas as pd
import numpy as np

## Two main data types (Series, DataFrame):

In [2]:
# A Series is a one-dimensional labelled column of values.
cars = pd.Series(["BMW", "Audi", "Mercedes", "Toyota"])
colors = pd.Series(["white", "blue", "red", "white"])

# Show both Series, separated by a dashed rule.
print(cars, "-------------", colors, sep="\n")

0         BMW
1        Audi
2    Mercedes
3      Toyota
dtype: object
-------------
0    white
1     blue
2      red
3    white
dtype: object


#### DataFrames (made up of Series):

In [3]:
# Build a DataFrame from a dict that maps each column name to a Series;
# the two Series are lined up on their shared index (0..3).
car_data = pd.DataFrame({"Car Model" : cars, "Color" : colors})
car_data


Unnamed: 0,Car Model,Color
0,BMW,white
1,Audi,blue
2,Mercedes,red
3,Toyota,white


### Importing Data:

In [4]:
# Load the CSV file into a DataFrame with read_csv.
# The path is relative, so the file must sit in the notebook's working directory.
car_sales = pd.read_csv("car-sales.csv")

In [5]:
# Display the car_sales DataFrame (a bare last expression is rendered as a table)
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


## Describing Data:

In [6]:
# Show the dtype of every column. Price comes back as object because the
# CSV stores it as formatted text (e.g. "$4,000.00") rather than as a number.
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

## Getting Mean values:

In [7]:
# Export the DataFrame to CSV; index=False keeps the index column out of the file.
car_sales.to_csv("exported-car-sales.csv", index=False)

# Price is loaded as text such as "$4,000.00". Strip the currency symbol and the
# thousands separator, then parse what is left as a number.
# The dtype guard makes the cell safe to re-run: once Price is numeric the .str
# accessor would raise, so the conversion is skipped on later runs.
if not pd.api.types.is_numeric_dtype(car_sales["Price"]):
    car_sales["Price"] = pd.to_numeric(
        car_sales["Price"]
        # regex=False: treat "$" and "," as literal characters. "$" is a regex
        # metacharacter and the default of `regex` changed in pandas 2.0, so the
        # literal match is requested explicitly instead of relying on the default.
        .str.replace("$", "", regex=False)
        .str.replace(",", "", regex=False)
    )

In [8]:
# Column-wise mean; numeric_only=True restricts it to the numeric columns
# (Make and Colour hold text and are left out).
car_sales.mean(numeric_only=True)

Odometer (KM)    78601.4
Doors                4.0
Price             7645.0
dtype: float64

In [9]:
# Sum a single column: the total of the Doors values across all rows
car_sales["Doors"].sum()

40

In [10]:
# Number of rows in the DataFrame (len counts rows, not columns)
len(car_sales)

10

## Getting Specific Data:

In [11]:
# Select rows by integer position with slicing; the end bound is exclusive,
# so 3:5 returns the rows at positions 3 and 4
car_sales.iloc[3:5]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
3,BMW,Black,11179,5,22000.0
4,Nissan,White,213095,4,3500.0


In [12]:
# Boolean filter: keep only the rows whose Make equals "Toyota"
car_sales.loc[car_sales["Make"].eq("Toyota")]


Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,4000.0
2,Toyota,Blue,32549,3,7000.0
5,Toyota,Green,99213,4,4500.0
8,Toyota,White,60000,4,6250.0


In [13]:
# Only keep cars with more than 75,000 km on the odometer
# (the column is recorded in kilometres, not miles)
car_sales[car_sales["Odometer (KM)"] > 75000]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,4000.0
1,Honda,Red,87899,4,5000.0
4,Nissan,White,213095,4,3500.0
5,Toyota,Green,99213,4,4500.0


In [14]:
# Cross-tabulate two columns: count the cars for every
# Make (rows) / Colour (columns) combination
pd.crosstab(car_sales["Make"], car_sales["Colour"])

Colour,Black,Blue,Green,Red,White
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BMW,1,0,0,0,0
Honda,0,2,0,1,0
Nissan,0,0,0,0,2
Toyota,0,1,1,0,2


In [15]:
# Smaller DataFrame: all rows, but only the listed columns, in the listed order
new_df = car_sales.loc[:, ["Colour", "Price", "Doors"]]
new_df

Unnamed: 0,Colour,Price,Doors
0,White,4000.0,4
1,Red,5000.0,4
2,Blue,7000.0,3
3,Black,22000.0,5
4,White,3500.0,4
5,Green,4500.0,4
6,Blue,7500.0,4
7,Blue,7000.0,4
8,White,6250.0,4
9,White,9700.0,4


In [16]:
# Combining conditions: | is element-wise "or", & is element-wise "and".
# Each comparison yields a boolean mask with one entry per row.
is_toyota = car_sales["Make"] == "Toyota"
is_nissan = car_sales["Make"] == "Nissan"
is_white = car_sales["Colour"] == "White"

# White cars whose make is either Toyota or Nissan
new_df = car_sales[(is_toyota | is_nissan) & is_white]
new_df

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,4000.0
4,Nissan,White,213095,4,3500.0
8,Toyota,White,60000,4,6250.0
9,Nissan,White,31600,4,9700.0


In [17]:
# Match several values in one column: isin() is True for every row whose
# Make appears in the given list
nissan_toyota = car_sales.loc[car_sales["Make"].isin(["Toyota", "Nissan"])]
nissan_toyota

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,4000.0
2,Toyota,Blue,32549,3,7000.0
4,Nissan,White,213095,4,3500.0
5,Toyota,Green,99213,4,4500.0
8,Toyota,White,60000,4,6250.0
9,Nissan,White,31600,4,9700.0
