### PANDAS DATA FRAME


Pandas DataFrame adalah struktur data dua dimensi yang digunakan untuk menyimpan data dalam bentuk tabel, mirip dengan spreadsheet atau tabel database. DataFrame memiliki label baris dan kolom, yang memungkinkan akses dan manipulasi data yang efisien.

Beberapa fitur utama dari Pandas DataFrame:
- Dapat menyimpan berbagai jenis data (numerik, string, dll.)
- Mendukung operasi aritmatika dan manipulasi data
- Dapat diimpor dari berbagai sumber data seperti CSV, Excel, SQL, dll.
- Memiliki berbagai metode untuk analisis data seperti pengelompokan, agregasi, dan transformasi

Contoh pembuatan DataFrame:

In [81]:
## Series adalah struktur data satu dimensi yang dapat menyimpan berbagai jenis data (seperti integer, string, float, dll.)
## Series mirip dengan array satu dimensi atau daftar dalam Python, tetapi memiliki kemampuan tambahan seperti label indeks
## Series dapat dibuat dari berbagai sumber data seperti daftar, array numpy, atau kamus
## Series mendukung operasi aritmatika dan manipulasi data, serta memiliki berbagai metode untuk analisis data
## Contoh penulisan Series

import pandas as pd

data =[1,2,3,4,5]
series = pd.Series(data)
print("Series \n",series)

Series 
 0    1
1    2
2    3
3    4
4    5
dtype: int64


In [82]:
## Create a series from dictionary

data = {'a':1,'b':2,'c':3}
series_dict = pd.Series(data)
print(series_dict)

a    1
b    2
c    3
dtype: int64


In [83]:
data = [10,20,30]
indeks = ['a','b','c']
pd.Series(data,index=indeks)

a    10
b    20
c    30
dtype: int64

In [84]:
## Data Frame merupakan  struktur data dua dimensi yang mirip dengan tabel dalam database atau spreadsheet 
## Contoh Penulisan Data Frame dari dictionary

data = {
    'Name': ['Raffa','Oktafian','Nuzqu'],
    'Age': [20,21,22],
    'City':['Purbalingga','New York','Chicago']
}

df = pd.DataFrame(data)
print(df,type(df))


       Name  Age         City
0     Raffa   20  Purbalingga
1  Oktafian   21     New York
2     Nuzqu   22      Chicago <class 'pandas.core.frame.DataFrame'>


In [85]:
import numpy as np

np.array(df)

array([['Raffa', 20, 'Purbalingga'],
       ['Oktafian', 21, 'New York'],
       ['Nuzqu', 22, 'Chicago']], dtype=object)

In [90]:
## Create Data Frame from a list to dictionaries

data = [
    {'Name': 'Raffa','Age':25,'City':'Bangladesh'},
    {'Name': 'Agus','Age':22,'City':'Bandung'},
    {'Name': 'Dodo','Age':21,'City':'Purwokerto'}
]

df = pd.DataFrame(data)
print(df,type(df))


    Name  Age        City
0  Raffa   25  Bangladesh
1   Agus   22     Bandung
2   Dodo   21  Purwokerto <class 'pandas.core.frame.DataFrame'>


In [87]:
## How to read a dataset

df = pd.read_csv('data.csv')
df.head(5) # menampilkan 5 baris pertama dataframe

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [92]:
## Accesing data from dataframe
df

Unnamed: 0,Name,Age,City
0,Raffa,25,Bangladesh
1,Agus,22,Bandung
2,Dodo,21,Purwokerto


In [95]:
df['Name'] #menampilkan kolom nama saja

0    Raffa
1     Agus
2     Dodo
Name: Name, dtype: object

In [96]:
df.loc[0] #menampilkan data baris pertama

Name         Raffa
Age             25
City    Bangladesh
Name: 0, dtype: object

In [97]:
df.iloc[0][2] #menampilkan baris 1 kolom kedua

  df.iloc[0][2] #menampilkan baris 1 kolom kedua


'Bangladesh'

In [98]:
## Accessing a spesific element
## AT
df.at[2,'Age'] #menampilkan indeks 1 dan kolom age

np.int64(21)

In [99]:
## Acccesing a specified element using iat

df.iat[2,2] #menampilkan data index baris ke 2 dan kolom index kedua

'Purwokerto'

In [100]:
df

Unnamed: 0,Name,Age,City
0,Raffa,25,Bangladesh
1,Agus,22,Bandung
2,Dodo,21,Purwokerto


In [110]:
## Data Manipulation w/ DataFrame
## Menambahkan Kolom baru
## Syarat kolom baru memiliki value sama dengan sebelumnya
df['Salary']=[5000,6000,7000] #menambahkan kolom baru
df

Unnamed: 0,Name,Age,City,Salary
0,Raffa,25,Bangladesh,5000
1,Agus,22,Bandung,6000
2,Dodo,21,Purwokerto,7000


In [None]:
## Remove a column
## axis = 0 yaitu mengecek baris
## axis = 1 mengecek baris dan kolom
## default axis = 0
df.drop('Salary',axis=1) # axis melakukan pengecekan data


Unnamed: 0,Name,Age,City
0,Raffa,25,Bangladesh
1,Agus,22,Bandung
2,Dodo,21,Purwokerto


In [112]:
df #ketika setelah drop kolom salary masih ada, karena drop tidak permanen

Unnamed: 0,Name,Age,City,Salary
0,Raffa,25,Bangladesh,5000
1,Agus,22,Bandung,6000
2,Dodo,21,Purwokerto,7000


In [113]:
## inplace
## Inplace digunakan untuk menghapus permanen kolom

df.drop('Salary',axis=1,inplace=True)
df

Unnamed: 0,Name,Age,City
0,Raffa,25,Bangladesh
1,Agus,22,Bandung
2,Dodo,21,Purwokerto


In [None]:
## Add age to the column

df['Age']=df['Age']+1 # menambah 1 data umur di setiap baris
df

Unnamed: 0,Name,Age,City
0,Raffa,26,Bangladesh
1,Agus,23,Bandung
2,Dodo,22,Purwokerto


In [118]:
df.drop(2,inplace=True)

In [119]:
df

Unnamed: 0,Name,Age,City
0,Raffa,26,Bangladesh
1,Agus,23,Bandung


In [120]:
df = pd.read_csv('Sales.csv')
df.head(5)

Unnamed: 0,Date,Day,Month,Year,Customer_Age,Age_Group,Customer_Gender,Country,State,Product_Category,Sub_Category,Product,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
0,2013-11-26,26,November,2013,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
1,2015-11-26,26,November,2015,19,Youth (<25),M,Canada,British Columbia,Accessories,Bike Racks,Hitch Rack - 4-Bike,8,45,120,590,360,950
2,2014-03-23,23,March,2014,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,23,45,120,1366,1035,2401
3,2016-03-23,23,March,2016,49,Adults (35-64),M,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,20,45,120,1188,900,2088
4,2014-05-15,15,May,2014,47,Adults (35-64),F,Australia,New South Wales,Accessories,Bike Racks,Hitch Rack - 4-Bike,4,45,120,238,180,418


In [None]:
# Display the data typs of each column
print("Data types:\n",df.dtypes)

# Describe the DataFrame
print("Statistical Summary:\n",df.describe())

# Group by a column and perform an aggregation
#grouped = df.groupby('Category')['Value'].mean()
#print("Mean value by category:\n",grouped)

Statistical Summary:
                  Day           Year   Customer_Age  Order_Quantity  \
count  113036.000000  113036.000000  113036.000000   113036.000000   
mean       15.665753    2014.401739      35.919212       11.901660   
std         8.781567       1.272510      11.021936        9.561857   
min         1.000000    2011.000000      17.000000        1.000000   
25%         8.000000    2013.000000      28.000000        2.000000   
50%        16.000000    2014.000000      35.000000       10.000000   
75%        23.000000    2016.000000      43.000000       20.000000   
max        31.000000    2016.000000      87.000000       32.000000   

           Unit_Cost     Unit_Price         Profit           Cost  \
count  113036.000000  113036.000000  113036.000000  113036.000000   
mean      267.296366     452.938427     285.051665     469.318695   
std       549.835483     922.071219     453.887443     884.866118   
min         1.000000       2.000000     -30.000000       1.000000   
25

In [125]:
df.describe()

Unnamed: 0,Day,Year,Customer_Age,Order_Quantity,Unit_Cost,Unit_Price,Profit,Cost,Revenue
count,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0,113036.0
mean,15.665753,2014.401739,35.919212,11.90166,267.296366,452.938427,285.051665,469.318695,754.37036
std,8.781567,1.27251,11.021936,9.561857,549.835483,922.071219,453.887443,884.866118,1309.094674
min,1.0,2011.0,17.0,1.0,1.0,2.0,-30.0,1.0,2.0
25%,8.0,2013.0,28.0,2.0,2.0,5.0,29.0,28.0,63.0
50%,16.0,2014.0,35.0,10.0,9.0,24.0,101.0,108.0,223.0
75%,23.0,2016.0,43.0,20.0,42.0,70.0,358.0,432.0,800.0
max,31.0,2016.0,87.0,32.0,2171.0,3578.0,15096.0,42978.0,58074.0
