# **Creating data using Files**

In [109]:
from google.colab import files

**creating random file**

In [110]:
with open('test.txt', 'w') as file:
  file.write('hello world')

In [111]:
with open('test.txt','r') as file:
  content = file.read()
  print(content)

hello world


# Creating CSV data

In [112]:
import csv

In [113]:
data = [
    ['Name', 'Age', 'City', 'Score'],
    ['Alice', 23, 'New York', 85],
    ['Bob', 29, 'Los Angeles', 90],
    ['Charlie', 35, 'Chicago', 75],
    ['David', 42, 'Houston', 88],
    ['Eva', 30, 'Phoenix', 95]
]

In [114]:
file_name = 'data.csv'

with open(file_name,'w',newline='') as file:
  writer = csv.writer(file)
  writer.writerows(data)

In [115]:
with open(file_name,'r') as file:
  content = file.read()
  print('Content of the CSV file:')
  print(content)

Content of the CSV file:
Name,Age,City,Score
Alice,23,New York,85
Bob,29,Los Angeles,90
Charlie,35,Chicago,75
David,42,Houston,88
Eva,30,Phoenix,95



# Creating TSV Data

In [116]:
file_name1 = 'data1.tsv'

with open(file_name1,'w',newline='') as file:
  writer = csv.writer(file,delimiter='\t')
  writer.writerows(data)



In [117]:
with open(file_name1,'r') as file:
  contents = file.read()
  print('Content of the TSV file:')
  print(content)

Content of the TSV file:
Name,Age,City,Score
Alice,23,New York,85
Bob,29,Los Angeles,90
Charlie,35,Chicago,75
David,42,Houston,88
Eva,30,Phoenix,95



# CSV Working

In [118]:
import pandas as pd

In [119]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,Name,Age,City,Score
0,Alice,23,New York,85
1,Bob,29,Los Angeles,90
2,Charlie,35,Chicago,75
3,David,42,Houston,88
4,Eva,30,Phoenix,95


 *opening a csv file from a url*

In [120]:
import requests
from io import StringIO

url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
response = requests.get(url)
data = StringIO(response.text)

df = pd.read_csv(data)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Sep Parameter

In [121]:
df2 = pd.read_csv('data1.tsv',sep=',')          # to read csv, if sep='\t' then it reads tsv data
df2.head()

Unnamed: 0,Name\tAge\tCity\tScore
0,Alice\t23\tNew York\t85
1,Bob\t29\tLos Angeles\t90
2,Charlie\t35\tChicago\t75
3,David\t42\tHouston\t88
4,Eva\t30\tPhoenix\t95


In [122]:
df2 = pd.read_csv('data1.tsv',sep='\t')          # to read csv, if sep='\t' then it reads tsv data
df2.head()

Unnamed: 0,Name,Age,City,Score
0,Alice,23,New York,85
1,Bob,29,Los Angeles,90
2,Charlie,35,Chicago,75
3,David,42,Houston,88
4,Eva,30,Phoenix,95


# Adding a label at top data which does not have a column name

In [123]:
data2 = [
    ['Alice', 23, 'New York', 85],
    ['Bob', 29, 'Los Angeles', 90],
    ['Charlie', 35, 'Chicago', 75],
    ['David', 42, 'Houston', 88],
    ['Eva', 30, 'Phoenix', 95]
]

In [124]:
with open('data3.csv', 'w', newline='') as file:
  writer = csv.writer(file)
  writer.writerows(data2)

In [125]:
pd.read_csv('data3.csv')


Unnamed: 0,Alice,23,New York,85
0,Bob,29,Los Angeles,90
1,Charlie,35,Chicago,75
2,David,42,Houston,88
3,Eva,30,Phoenix,95


In [126]:
pd.read_csv('data3.csv', names=['Name', 'Age', 'City', 'Score'])


Unnamed: 0,Name,Age,City,Score
0,Alice,23,New York,85
1,Bob,29,Los Angeles,90
2,Charlie,35,Chicago,75
3,David,42,Houston,88
4,Eva,30,Phoenix,95


# Index_col Parameter

suppose we have a column by default that can be used as index then we can remove the default index column

In [127]:
pd.read_csv('data.csv', index_col='Score')

Unnamed: 0_level_0,Name,Age,City
Score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
85,Alice,23,New York
90,Bob,29,Los Angeles
75,Charlie,35,Chicago
88,David,42,Houston
95,Eva,30,Phoenix


# Header Parameter

In [128]:
pd.read_csv('data.csv', header=2, names=['Name', 'Age', 'City', 'Score']) # this method can be used to trim data from top

Unnamed: 0,Name,Age,City,Score
0,Charlie,35,Chicago,75
1,David,42,Houston,88
2,Eva,30,Phoenix,95


# use_cols parameter

In [129]:
pd.read_csv('data.csv', usecols=['Name', 'Age'])

Unnamed: 0,Name,Age
0,Alice,23
1,Bob,29
2,Charlie,35
3,David,42
4,Eva,30


# Squeeze parameter

it has converted dataframe to series data

In [130]:
age_data = pd.read_csv('data.csv', usecols=['Name'])
age_data.squeeze()


0      Alice
1        Bob
2    Charlie
3      David
4        Eva
Name: Name, dtype: object

# Skipwors/nrows

In [131]:
pd.read_csv('data.csv', skiprows=[])

Unnamed: 0,Name,Age,City,Score
0,Alice,23,New York,85
1,Bob,29,Los Angeles,90
2,Charlie,35,Chicago,75
3,David,42,Houston,88
4,Eva,30,Phoenix,95


# Encoding

In [132]:
pd.read_csv('data.csv', encoding= 'unicode_escape')

Unnamed: 0,Name,Age,City,Score
0,Alice,23,New York,85
1,Bob,29,Los Angeles,90
2,Charlie,35,Chicago,75
3,David,42,Houston,88
4,Eva,30,Phoenix,95


# dtypes

In [133]:
pd.read_csv('data.csv').info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    5 non-null      object
 1   Age     5 non-null      int64 
 2   City    5 non-null      object
 3   Score   5 non-null      int64 
dtypes: int64(2), object(2)
memory usage: 288.0+ bytes


In [134]:
pd.read_csv('data.csv', dtype={'Age': float}).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     5 non-null      float64
 2   City    5 non-null      object 
 3   Score   5 non-null      int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 288.0+ bytes


# **Handling Dates**

In [137]:

from datetime import datetime, timedelta

# Read the existing CSV file
df = pd.read_csv('data.csv')

# Define a starting date
start_date = datetime(2023, 1, 1)

# Create a date range
date_range = [start_date + timedelta(days=i) for i in range(len(df))]

# Add the date column to the dataframe
df['Date'] = date_range

# Save the modified dataframe to a new CSV file
df.to_csv('data.csv', index=False)

# Display the modified dataframe
print(df)




      Name  Age         City  Score       Date
0    Alice   23     New York     85 2023-01-01
1      Bob   29  Los Angeles     90 2023-01-02
2  Charlie   35      Chicago     75 2023-01-03
3    David   42      Houston     88 2023-01-04
4      Eva   30      Phoenix     95 2023-01-05


In [138]:
pd.read_csv('data.csv', parse_dates=['Date']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Name    5 non-null      object        
 1   Age     5 non-null      int64         
 2   City    5 non-null      object        
 3   Score   5 non-null      int64         
 4   Date    5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 328.0+ bytes
