In [2]:
import random

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


# Lecture pandas basics
- pandas.Series
- pandas.DataFrame
- read_csv
- indexing
- plotting

## Pandas Series
- can be created from a dictionary (keys become the index labels)
- can be created from a list
- can be created from a NumPy array

In [3]:
# Number of students per program, keyed by program name.
programs_dict = {"AI": 26, "NET": 38, "Java": 30, "UX": 28}

# Building a Series from a dict: the keys become the index labels and the
# values become the data (pandas is already imported in the first cell).
programs_series = pd.Series(programs_dict)
programs_series

AI      26
NET     38
Java    30
UX      28
dtype: int64

In [14]:
# Extract values through indexing.
# Positional access goes through .iloc: with string labels, a bare
# programs_series[0] relies on a deprecated integer-position fallback.
print("Extract values through indexing:")
print(f"{programs_series.iloc[0] = }")
print(f"{programs_series.iloc[-1] = }")
# Label access uses plain square brackets with an index label.
print(f"{programs_series['UX'] = }")
print("\n")

# Get the keys: for a Series these are the index labels.
print("Get Keys:")
print(f"{programs_series.keys() = }")
print(f"{programs_series.keys()[0] = }")
print("\n")




Extract values through indexing:
programs_series[0] = 26
programs_series[-1] = 28
programs_series['UX'] = 28


Get Keys:
programs_series.keys() = Index(['AI', 'NET', 'Java', 'UX'], dtype='object')
programs_series.keys()[0] = 'AI'




In [17]:
# Seed Python's random module so the simulated rolls are reproducible
# (random is imported in the first cell together with the other imports).
random.seed(1337)

# Ten rolls of a six-sided die; a Series built from a list gets the default
# 0..n-1 integer index.
dice_series = pd.Series([random.randint(1, 6) for _ in range(10)])
dice_series.head(2)

0    5
1    5
dtype: int64

In [21]:

# Summary statistics of the simulated dice rolls.
print(f"{dice_series.min() = }")
print(f"{dice_series.argmin() = }") # integer position of the smallest value (equals its label here because of the default integer index)
print(f"{dice_series.max() = }")
print(f"{dice_series.mean() = }")
print(f"{dice_series.median() = }")

dice_series.min() = 2
dice_series.argmin() = 7
dice_series.max() = 6
dice_series.mean() = 4.4
dice_series.median() = 5.0


---
## DataFrame

- tabular data with rows and columns.
- analogous to a 2D NumPy array, but with flexible row indices and column names.
- can be seen as a "specialized" dictionary in which each column name is mapped to a Series object.

In [22]:
# Promote the Series to a one-column DataFrame; the unnamed Series yields a
# column labelled 0 and keeps the program names as the row index.
programs_series.to_frame()

Unnamed: 0,0
AI,26
NET,38
Java,30
UX,28


In [23]:
# One-column DataFrame whose single column is labelled explicitly; the dict
# key supplies the column name and the Series supplies values and row index.
pd.DataFrame({"Number_of_students": programs_series})

Unnamed: 0,Number_of_students
AI,26
NET,38
Java,30
UX,28


In [24]:
# Two Series that share the same program labels as their index.
students = pd.Series([26, 38, 28, 30], index=["AI", "NET", "UX", "Java"])
skills = pd.Series(["Python", "C#", "Figma", "JAVA"], index=["AI", "NET", "UX", "Java"])

# Combine them column-wise: each keyword becomes a column name and the rows
# are aligned on the shared index.
df_programs = pd.DataFrame(dict(Students=students, Skills=skills))
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,JAVA


In [25]:
# Selecting one column by name returns it as a Series that keeps the row index.
df_programs["Students"]

AI      26
NET     38
UX      28
Java    30
Name: Students, dtype: int64

In [26]:
# Arithmetic mean of the Students column: (26 + 38 + 28 + 30) / 4 = 30.5.
df_programs["Students"].mean()

30.5

In [29]:
# Row labels of the table, i.e. the program names, as a plain Python list.
program_names = df_programs.index.to_list()

# Median class size; formatted without decimals in the report line below.
median_stu = df_programs["Students"].median()
print(f"Median students in the programs {program_names} is {median_stu:.0f}")

Median students in the programs ['AI', 'NET', 'UX', 'Java'] is 29


In [30]:
# Show the whole programs table again (it only has four rows).
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,JAVA


## Indexers
- loc
- iloc

In [32]:
# .loc selects by index label; a single label returns that row as a Series
# whose index is the column names.
df_programs.loc["AI"]    # By label name

Students        26
Skills      Python
Name: AI, dtype: object

In [33]:
# .iloc selects by integer position; the slice 1:4 excludes its end, so this
# returns the rows at positions 1, 2 and 3 (NET, UX, Java).
df_programs.iloc[1:4]  # By index location

Unnamed: 0,Students,Skills
NET,38,C#
UX,28,Figma
Java,30,JAVA


## Masking

In [35]:
# Starting point for the masking examples: the unfiltered programs table.
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,JAVA


In [37]:
# Element-wise comparison: yields one boolean per row (the mask), True where
# the program has at least 30 students. It does not filter the table by itself.
df_programs["Students"] >= 30

AI      False
NET      True
UX      False
Java     True
Name: Students, dtype: bool


In [40]:
# Filter with a boolean mask: .loc keeps only the rows where the comparison
# is True, i.e. programs with 30 or more students.
new_frame = df_programs.loc[df_programs["Students"] >= 30]

In [43]:
# Result of the mask filter: only the programs with at least 30 students.
new_frame

Unnamed: 0,Students,Skills
NET,38,C#
Java,30,JAVA


---
## Excel Data (Calories)


In [45]:
# Load the calorie table from the Excel workbook (path is relative to the
# notebook). pandas, matplotlib and seaborn are imported once in the first
# cell, so no imports are repeated here.
df = pd.read_excel("../Data/calories.xlsx")

# Peek at the first rows to see the columns and the raw value formats.
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


In [46]:
# Column dtypes and non-null counts. Every column is stored as object,
# including the calorie and kJ columns that hold numbers with a unit suffix.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FoodCategory      2225 non-null   object
 1   FoodItem          2225 non-null   object
 2   per100grams       2225 non-null   object
 3   Cals_per100grams  2225 non-null   object
 4   KJ_per100grams    2225 non-null   object
dtypes: object(5)
memory usage: 87.0+ KB


In [48]:
# (number of rows, number of columns)
df.shape

(2225, 5)

In [52]:
# Distinct food categories; the list mixes solid foods with drinks
# (e.g. Beer, Wine, Soda&SoftDrinks).
df["FoodCategory"].unique()

array(['CannedFruit', 'Fruits', 'Tropical&ExoticFruits', 'PotatoProducts',
       'Vegetables', 'FastFood', 'Pizza', 'Cheese', 'CreamCheese',
       'Milk&DairyProducts', 'SlicedCheese', 'Yogurt', 'Beef&Veal',
       'ColdCuts&LunchMeat', 'Meat', 'Offal&Giblets', 'Pork',
       'Poultry&Fowl', 'Sausage', 'Venison&Game', 'Cakes&Pies',
       'Candy&Sweets', 'IceCream', '(Fruit)Juices',
       'AlcoholicDrinks&Beverages', 'Beer',
       'Non-AlcoholicDrinks&Beverages', 'Soda&SoftDrinks', 'Wine',
       'CerealProducts', 'Oatmeal,Muesli&Cereals', 'Pasta&Noodles',
       'Dishes&Meals', 'Soups', 'Legumes', 'Nuts&Seeds', 'Oils&Fats',
       'VegetableOils', 'BakingIngredients', 'Fish&Seafood',
       'Herbs&Spices', 'Pastries,Breads&Rolls', 'Sauces&Dressings',
       'Spreads'], dtype=object)

In [53]:
# Reference amount per item: only two values occur, "100g" and "100ml"
# (presumably solids vs. liquids, which the cleaning step separates).
df["per100grams"].unique()

array(['100g', '100ml'], dtype=object)

## Data Cleaning and explorations
- convert string columns that hold numeric values (e.g. `62 cal`, `260 kJ`) to integers
- change column names
- separate into liquids and solids


In [54]:
# Look at the raw rows once more before cleaning: note the long column names
# and the unit suffixes ("cal", "kJ") inside the values.
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


In [55]:
# Shorten the column names. rename() returns a new frame, which is bound back
# to df rather than mutating in place; labels missing from the mapping are
# left untouched, so re-running the cell is harmless.
df = df.rename(
    columns={
        "Cals_per100grams": "Calories",
        "KJ_per100grams": "kJ",
        "per100grams": "per100",
    }
)

In [57]:
# Confirm the rename: the columns are now per100, Calories and kJ.
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100,Calories,kJ
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ
