In [None]:
import pandas as pd
import numpy as np

pd.__version__

'1.5.3'

In [None]:
# Create a pandas Series of floats from a plain Python list.
# No index is supplied, so pandas labels the elements 0..4.

s1 = pd.Series([3, 4, 5, 6, 7], dtype="float64")
# Show, one per line: the Series itself, its underlying values,
# its index, and the element stored under label 3.
print(s1, s1.values, s1.index, s1[3], sep="\n")

0    3.0
1    4.0
2    5.0
3    6.0
4    7.0
dtype: float64
[3. 4. 5. 6. 7.]
RangeIndex(start=0, stop=5, step=1)
6.0


In [None]:
# Create a pandas Series with explicit string labels (the `index` parameter),
# then access its values, its index, and a single element by label.

s2 = pd.Series(
    data=[3, 4, 5, 6, 7],
    index=['March', 'April', 'May', 'June', 'July'],
)
# One item per line: the Series, its values, its index, and the 'April' entry.
print(s2, s2.values, s2.index, s2['April'], sep="\n")

March    3
April    4
May      5
June     6
July     7
dtype: int64
[3 4 5 6 7]
Index(['March', 'April', 'May', 'June', 'July'], dtype='object')
4


In [None]:
# Create a pandas DataFrame from a dictionary of equal-length lists:
# each key becomes a column, and the rows are labelled 'a' through 'e'.

data_frame = pd.DataFrame(
    data={
        'City': ['Toronto', 'Mumbai', 'Houston', 'Austin', 'College Station'],
        'Year': [2022, 1987, 2020, 2015, 2013],
    },
    index=list('abcde'),
)
print(data_frame)

              City  Year
a          Toronto  2022
b           Mumbai  1987
c          Houston  2020
d           Austin  2015
e  College Station  2013


In [None]:
# Create a pandas DataFrame from a NumPy array whose values are drawn from a
# standard normal distribution.
# A seeded Generator is used so the table is identical on every
# Restart Kernel -> Run All; the unseeded global RNG gave new values each run.

arr1 = np.random.default_rng(seed=42).standard_normal((5, 3))

df = pd.DataFrame(arr1, columns=['A', 'B', 'C'])
print(df)

          A         B         C
0  0.101690  1.323819  0.499840
1 -0.384334  1.217441 -0.143479
2 -1.355664 -0.614983  0.278091
3 -0.206157  2.710141  0.161147
4  1.299134  1.011384  0.560531


In [None]:
# Create a DataFrame from a dictionary of Series objects:
# first turn a plain dict into a Series (keys become the index),
# then map the column name 'Area' to that Series.
cities_dict = {'California': 47364732, 'Texas': 5848774, 'Arizona': 8347348}
series_cities = pd.Series(cities_dict)
print(series_cities)
df_cities = pd.DataFrame({'Area': series_cities})
print(df_cities)

California    47364732
Texas          5848774
Arizona        8347348
dtype: int64
                Area
California  47364732
Texas        5848774
Arizona      8347348


In [None]:
 # Create a DataFrame from a 2-D array of uniform samples in [0, 1).
 # A seeded Generator keeps the values reproducible across kernel restarts;
 # the unseeded global RNG produced a different table on every run.
 A = pd.DataFrame(np.random.default_rng(seed=0).random((3, 2)), columns=['foo', 'bar'], index=['a', 'b', 'c'])
 print(A)

        foo       bar
a  0.735321  0.695222
b  0.131802  0.853309
c  0.465276  0.179038


In [None]:
# Small customer table: one dict key per column, one list entry per row.
customer_dict = dict(
    Name=['Lily', 'Emma', 'John'],
    Age=[38, 15, 28],
    Education=['High School', 'Bachelors', 'Masters'],
    Gender=['f', 'f', 'm'],
)
customer = pd.DataFrame(data=customer_dict)
print(customer)

   Name  Age    Education Gender
0  Lily   38  High School      f
1  Emma   15    Bachelors      f
2  John   28      Masters      m


In [None]:
# Select the first 2 rows of the Age and Education columns in two ways:
#   .loc  is label-based and includes the end label (rows 0 through 1);
#   .iloc is position-based and excludes the end position (rows 0-1, columns 1-2).
print(customer.loc[0:1, ['Age', 'Education']])
print(customer.iloc[0:2, [1, 2]])

   Age    Education
0   38  High School
1   15    Bachelors
   Age    Education
0   38  High School
1   15    Bachelors


In [None]:
# Use a boolean mask to display a subset of the rows:
# customers whose Gender is 'f' and whose Age is exactly 38.
customer_filter = customer['Gender'].eq('f') & customer['Age'].eq(38)
print(customer.loc[customer_filter])

   Name  Age    Education Gender
0  Lily   38  High School      f


In [None]:
# Find out which elements are missing in the DataFrame.
# np.nan is the canonical spelling; the np.NAN alias was removed in NumPy 2.0
# and raises AttributeError there.
df = pd.DataFrame({"A": [1.0, 2.0, 3.0, np.nan],
                   "B": [2.4, 6.2, 5.1, np.nan],
                   "C": ["foo", "zoo", "bar", None]})

print(df)
print(df.isna())   # True where a value is missing (NaN or None)
print(df.notna())  # element-wise inverse of isna()

     A    B     C
0  1.0  2.4   foo
1  2.0  6.2   zoo
2  3.0  5.1   bar
3  NaN  NaN  None
       A      B      C
0  False  False  False
1  False  False  False
2  False  False  False
3   True   True   True
       A      B      C
0   True   True   True
1   True   True   True
2   True   True   True
3  False  False  False


In [None]:

# Student table with missing values scattered across columns B, C and D.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_students = pd.DataFrame({"A": [1, 2, 3, 4, 7],
                            "B": [2.4, np.nan, 5.1, np.nan, 2.6],
                            "C": ["phd", "phd", "high school", "high school", np.nan],
                            "D": [3.0, np.nan, np.nan, np.nan, np.nan]})
print(df_students, "\n")

# Drop rows that have at least 1 missing value

df_students_dropped = df_students.dropna(axis=0, how='any')
print("Dropped dataframe = \n")
print(df_students_dropped)

# Keep rows that have at least 3 non-missing values
df_students_keep = df_students.dropna(axis=0, thresh=3)
print("Dropped dataframe = \n")
print(df_students_keep)

   A    B            C    D
0  1  2.4          phd  3.0
1  2  NaN          phd  NaN
2  3  5.1  high school  NaN
3  4  NaN  high school  NaN
4  7  2.6          NaN  NaN 

Dropped dataframe = 

   A    B    C    D
0  1  2.4  phd  3.0
Dropped dataframe = 

   A    B            C    D
0  1  2.4          phd  3.0
2  3  5.1  high school  NaN


In [None]:
# Use fillna to handle missing values

# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_students_fillna = pd.DataFrame({"A": [1, 2, 3, 4, 7],
                            "B": [2.4, np.nan, 5.1, np.nan, 2.6],
                            "C": ["phd", "phd", "high school", "high school", np.nan],
                            "D": [3.0, np.nan, np.nan, np.nan, np.nan]})
print(df_students_fillna)

# Assign each filled column back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained form is deprecated
# and does not update the parent frame when Copy-on-Write is enabled.
average_value = df_students_fillna['B'].mean()  # mean() skips NaN by default
df_students_fillna['B'] = df_students_fillna['B'].fillna(average_value)
df_students_fillna['D'] = df_students_fillna['D'].fillna(3.0)
# mode() returns every most-frequent value in sorted order. Take the first one
# by position so the lookup also works when the column has a single mode
# (a label lookup of [1] raises KeyError in that case).
value_mode = df_students_fillna['C'].mode().iloc[0]
df_students_fillna['C'] = df_students_fillna['C'].fillna(value_mode)

print(df_students_fillna)

   A    B            C    D
0  1  2.4          phd  3.0
1  2  NaN          phd  NaN
2  3  5.1  high school  NaN
3  4  NaN  high school  NaN
4  7  2.6          NaN  NaN


In [None]:
# Sort the rows of a DataFrame by the values of one column

df_sort_values = pd.DataFrame(
    data={
        "Date": ['2023-09-03', '2023-10-10', '2023-08-15', '2023-09-19'],
        "Product": ["A", "B", "C", "D"],
        "Revenue": [4000, 1000, 3000, 2000],
    }
)
print(df_sort_values)
print("\n After Sorting: \n")
# Rebind the name to the sorted frame; ascending order is the sort_values default.
# The original row labels are kept, so the index shows how rows moved.
df_sort_values = df_sort_values.sort_values("Revenue")
print(df_sort_values)

         Date Product  Revenue
0  2023-09-03       A     4000
1  2023-10-10       B     1000
2  2023-08-15       C     3000
3  2023-09-19       D     2000

 After Sorting: 

         Date Product  Revenue
1  2023-10-10       B     1000
3  2023-09-19       D     2000
2  2023-08-15       C     3000
0  2023-09-03       A     4000


In [None]:
# Drop columns from a DataFrame

# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df_products = pd.DataFrame({"ID": ['gds73', 'tir7334', 'jfk77634', '47643'],
                            "Product": ["A", "B", "C", "D"],
                            "Revenue": [4000, 1000, 3000, 2000],
                            "Region": [np.nan, np.nan, np.nan, np.nan],
                            "Expenses": [3000, 500, 2000, 2000]
                            })
# Rebind to the result rather than mutating in place; `columns=` names the
# axis explicitly and is equivalent to drop(labels, axis=1).
df_products = df_products.drop(columns=["ID", "Region"])
print(df_products)

  Product  Revenue  Expenses
0       A     4000      3000
1       B     1000       500
2       C     3000      2000
3       D     2000      2000


In [None]:
# Add 100 to the Expenses column, then create a new column
# Profits = Revenue - Expenses (computed from the updated Expenses).
# Note: Expenses is overwritten, so each re-run of this cell adds another 100.

df_products["Expenses"] = df_products["Expenses"] + 100
df_products["Profits"] = df_products["Revenue"].sub(df_products["Expenses"])

print(df_products)

  Product  Revenue  Expenses  Profits
0       A     4000      3100      900
1       B     1000       600      400
2       C     3000      2100      900
3       D     2000      2100     -100
