In [1]:
# File handling in Python, part 1: reading a CSV file with the standard
# library's csv module.
# Method 1: csv.reader, which yields every row as a list of strings.
#
# The input file Auto.csv has a header row followed by 397 data rows,
# each holding the same 9 features/descriptors:
#
# mpg: miles per gallon
# cylinders: Number of cylinders between 4 and 8
# displacement: Engine displacement (cu. inches)
# horsepower: Engine horsepower
# weight: Vehicle weight (lbs.)
# acceleration: Time to accelerate from 0 to 60 mph (sec.)
# year: Model year (modulo 100)
# origin: Origin of car (1. American, 2. European, 3. Japanese)
# name: Vehicle name
#
import csv

filename = "Auto.csv"  # Replace with your CSV file name

# newline='' leaves line-ending handling to the csv module, as its docs recommend.
with open(filename, mode='r', newline='') as csvfile:
    csv_reader = csv.reader(csvfile)

    # The first row is the header: take it off the reader so that the loop
    # below only sees data rows.
    header = next(csv_reader)
    print(f"Header: {header}")

    # Print the remaining rows one by one.
    for row in csv_reader:
        print(row)

Header: ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin', 'name']
['18', '8', '307', '130', '3504', '12', '70', '1', 'chevrolet chevelle malibu']
['15', '8', '350', '165', '3693', '11.5', '70', '1', 'buick skylark 320']
['18', '8', '318', '150', '3436', '11', '70', '1', 'plymouth satellite']
['16', '8', '304', '150', '3433', '12', '70', '1', 'amc rebel sst']
['17', '8', '302', '140', '3449', '10.5', '70', '1', 'ford torino']
['15', '8', '429', '198', '4341', '10', '70', '1', 'ford galaxie 500']
['14', '8', '454', '220', '4354', '9', '70', '1', 'chevrolet impala']
['14', '8', '440', '215', '4312', '8.5', '70', '1', 'plymouth fury iii']
['14', '8', '455', '225', '4425', '10', '70', '1', 'pontiac catalina']
['15', '8', '390', '190', '3850', '8.5', '70', '1', 'amc ambassador dpl']
['15', '8', '383', '170', '3563', '10', '70', '1', 'dodge challenger se']
['14', '8', '340', '160', '3609', '8', '70', '1', "plymouth 'cuda 340"]
['15', '8', '400', '1

In [2]:
# Method 2: csv.DictReader, which yields every row of the same file as a
# dictionary instead of a list.
import csv

filename = "Auto.csv"  # Replace with your CSV file name

with open(filename, mode='r', newline='') as csvfile:
    # DictReader takes the header row itself and uses its entries as the keys.
    csvreader = csv.DictReader(csvfile)
    for row in csvreader:
        print(row)  # e.g. {'mpg': '18', 'cylinders': '8', ...}
 

{'mpg': '18', 'cylinders': '8', 'displacement': '307', 'horsepower': '130', 'weight': '3504', 'acceleration': '12', 'year': '70', 'origin': '1', 'name': 'chevrolet chevelle malibu'}
{'mpg': '15', 'cylinders': '8', 'displacement': '350', 'horsepower': '165', 'weight': '3693', 'acceleration': '11.5', 'year': '70', 'origin': '1', 'name': 'buick skylark 320'}
{'mpg': '18', 'cylinders': '8', 'displacement': '318', 'horsepower': '150', 'weight': '3436', 'acceleration': '11', 'year': '70', 'origin': '1', 'name': 'plymouth satellite'}
{'mpg': '16', 'cylinders': '8', 'displacement': '304', 'horsepower': '150', 'weight': '3433', 'acceleration': '12', 'year': '70', 'origin': '1', 'name': 'amc rebel sst'}
{'mpg': '17', 'cylinders': '8', 'displacement': '302', 'horsepower': '140', 'weight': '3449', 'acceleration': '10.5', 'year': '70', 'origin': '1', 'name': 'ford torino'}
{'mpg': '15', 'cylinders': '8', 'displacement': '429', 'horsepower': '198', 'weight': '4341', 'acceleration': '10', 'year': '70

In [3]:
# Switch to the pandas library for reading the .csv file and for the
# subsequent data manipulation.
import numpy as np
import pandas as pd

# read_csv parses the file, header row included, into a DataFrame.
Auto = pd.read_csv('Auto.csv')
# A bare name on the last line of a cell displays its value.
Auto

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
392,27.0,4,140.0,86,2790,15.6,82,1,ford mustang gl
393,44.0,4,97.0,52,2130,24.6,82,2,vw pickup
394,32.0,4,135.0,84,2295,11.6,82,1,dodge rampage
395,28.0,4,120.0,79,2625,18.6,82,1,ford ranger


In [4]:
import numpy as np
import pandas as pd

Auto = pd.read_csv('Auto.csv')
# Select the single column 'horsepower' (all rows); the result is a Series.
Auto.loc[:, 'horsepower']


0      130
1      165
2      150
3      150
4      140
      ... 
392     86
393     52
394     84
395     79
396     82
Name: horsepower, Length: 397, dtype: object

In [5]:
# np.unique removes the repeated values and returns the distinct entries
# of the column in sorted order.
import numpy as np
import pandas as pd

Auto = pd.read_csv('Auto.csv')
np.unique(Auto.loc[:, 'horsepower'])

array(['100', '102', '103', '105', '107', '108', '110', '112', '113',
       '115', '116', '120', '122', '125', '129', '130', '132', '133',
       '135', '137', '138', '139', '140', '142', '145', '148', '149',
       '150', '152', '153', '155', '158', '160', '165', '167', '170',
       '175', '180', '190', '193', '198', '200', '208', '210', '215',
       '220', '225', '230', '46', '48', '49', '52', '53', '54', '58',
       '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70',
       '71', '72', '74', '75', '76', '77', '78', '79', '80', '81', '82',
       '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93',
       '94', '95', '96', '97', '98', '?'], dtype=object)

In [6]:
# The unique values of 'horsepower' include '?', which this file uses to
# mark a missing entry. Passing na_values=['?'] tells read_csv to treat
# every '?' as NaN (a missing value), so the column is parsed as numbers
# instead of strings.
import numpy as np
import pandas as pd  # imported here as well, so the cell does not depend on an earlier cell

Auto = pd.read_csv('Auto.csv', na_values=['?'])
# sum() skips NaN entries by default, so the missing values do not affect the total.
Auto['horsepower'].sum()

40952.0

In [7]:
import numpy as np
import pandas as pd

Auto = pd.read_csv('Auto.csv', na_values=['?'])
# (number of rows, number of columns); rows with missing values are still included.
Auto.shape

(397, 9)

In [8]:
# Build a new DataFrame, Auto_new, by dropping every row that contains a
# missing value (here, the entries that were '?' in the file).
import numpy as np
import pandas as pd

Auto = pd.read_csv('Auto.csv', na_values=['?'])
# axis=0 and how='any' are dropna's defaults, spelled out: a row is dropped
# as soon as any one of its cells is NaN.
Auto_new = Auto.dropna(axis=0, how='any')
Auto_new.shape

(392, 9)

In [9]:
# List the column headers of the cleaned data.
#
import numpy as np
import pandas as pd

# Load the file and drop the incomplete rows in one chained expression.
Auto_new = pd.read_csv('Auto.csv', na_values=['?']).dropna()
# From here on, Auto refers to the cleaned frame.
Auto = Auto_new
Auto.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name'],
      dtype='object')

In [10]:
# Show the first three rows of the cleaned data.
#
import pandas as pd
import numpy as np

Auto = pd.read_csv('Auto.csv', na_values=['?'])
Auto_new = Auto.dropna()
Auto = Auto_new
# Only the last expression of a cell is displayed, so the slice is the only
# bare expression here.
Auto[:3]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite


In [15]:
# Show only the cars whose model year is later than 1980.
# The 'year' column stores the year modulo 100, so the test is year > 80.
import numpy as np
import pandas as pd

Auto_new = pd.read_csv('Auto.csv', na_values=['?']).dropna()
Auto = Auto_new
# Boolean Series: True for the rows to keep.
idx_80 = Auto['year'].gt(80)
Auto.loc[idx_80]

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
338,27.2,4,135.0,84.0,2490,15.7,81,1,plymouth reliant
339,26.6,4,151.0,84.0,2635,16.4,81,1,buick skylark
340,25.8,4,156.0,92.0,2620,14.4,81,1,dodge aries wagon (sw)
341,23.5,6,173.0,110.0,2725,12.6,81,1,chevrolet citation
342,30.0,4,135.0,84.0,2385,12.9,81,1,plymouth reliant
343,39.1,4,79.0,58.0,1755,16.9,81,3,toyota starlet
344,39.0,4,86.0,64.0,1875,16.4,81,1,plymouth champ
345,35.1,4,81.0,60.0,1760,16.1,81,3,honda civic 1300
346,32.3,4,97.0,67.0,2065,17.8,81,3,subaru
347,37.0,4,85.0,65.0,1975,19.4,81,3,datsun 210 mpg


In [16]:
# We can also display the data of just a few selected columns.
import numpy as np
import pandas as pd

Auto_new = pd.read_csv('Auto.csv', na_values=['?']).dropna()
Auto = Auto_new
# A list of column names selects those columns, in the order given.
Auto.loc[:, ['mpg', 'horsepower']]

Unnamed: 0,mpg,horsepower
0,18.0,130.0
1,15.0,165.0
2,18.0,150.0
3,16.0,150.0
4,17.0,140.0
...,...,...
392,27.0,86.0
393,44.0,52.0
394,32.0,84.0
395,28.0,79.0


In [17]:
# Pandas offers a huge number of functions for data manipulation; I urge you
# to explore them on your own.
# Here we learn how to write data to a CSV file using pandas: call the
# DataFrame's to_csv method with the name of the output file.
import pandas as pd
import numpy as np

Auto = pd.read_csv('Auto.csv', na_values=['?'])
Auto_new = Auto.dropna()
Auto = Auto_new
# read_csv and dropna already return DataFrames, so no conversion is needed;
# df is simply a second DataFrame built from the cleaned data.
df = pd.DataFrame(Auto)
# Write the cleaned data to a new file called Auto_new.csv.
# index=False leaves out the row labels; otherwise they would be written as
# an extra, unnamed first column (with gaps where rows were dropped), and the
# file would no longer have the same 9 columns as Auto.csv.
df.to_csv('Auto_new.csv', index=False)