In [1]:
import pandas as pd
import numpy as np
import csv
import re
import ast

In [2]:
pd.options.display.max_columns = False


In [3]:
chunk = pd.read_csv('../../data/raw.csv', engine='c',sep=',', encoding='utf-8', on_bad_lines='skip', chunksize=11000)

In [4]:
df = chunk.get_chunk(11000)

In [5]:
selected_cols = ['title', 'createdAt',
       'cities', 'price', 'smallDescription', 'store.name',
]

In [6]:
df = df[selected_cols]

In [7]:
df['createdAt'] = pd.to_datetime(df['createdAt'])
df.price = pd.to_numeric(df.price, downcast='integer')

In [8]:
df.dtypes

title                            object
createdAt           datetime64[ns, UTC]
cities                           object
price                           float64
smallDescription                 object
store.name                       object
dtype: object

In [9]:
city = pd.json_normalize(df.cities.apply(ast.literal_eval).explode()).drop(columns=['id', 'slug', '__typename', 'region.id', 'region.slug', 'region.__typename' ]).rename(columns={'name':'Town', 'region.name':'Wilaya'})
area = pd.json_normalize(df.smallDescription.apply(ast.literal_eval).explode()).drop(columns= ['__typename']).rename(columns={'valueText': 'Area'})

In [10]:
area.Area = area.Area.astype(str)

In [11]:
def to_integer(s: str) -> pd.Int64Dtype(): 
    match = re.search(r'(\d+)', s or "")
    return int(match.group(1)) if match else pd.NA

area.Area = area.Area.apply(to_integer)

In [12]:
df = pd.concat([df, city, area], axis=1)

In [13]:
df = df.drop(columns=['cities', 'smallDescription'])

In [14]:
df = df.rename(columns ={'title': 'Title', 'createdAt': 'Date', 'store.name': 'Store', 'price': 'Price'})

In [15]:
df = df.iloc[:11000]

In [16]:
df['FlatType'] = df['Title'].str.extract(r'\b(F\d+)\b', expand=False)

In [17]:
mask = df['Title'].str.contains(r'\b(Location|Vente)\b', regex=True, na=False, case=False)

  mask = df['Title'].str.contains(r'\b(Location|Vente)\b', regex=True, na=False, case=False)


In [18]:
df = df[~(mask == False)]

In [19]:
mask = df['Title'].str.contains(r'\b(Location)\b', regex=True, na=False, case=False)

  mask = df['Title'].str.contains(r'\b(Location)\b', regex=True, na=False, case=False)


In [21]:
df["isForRent"] = mask

In [22]:
df

Unnamed: 0,Title,Date,Price,Store,Town,Wilaya,Area,FlatType,isForRent
0,Vente Appartement F4 Alger Bordj el kiffan,2025-07-22 15:56:01+00:00,15000000.0,Le Roi Immo,Bordj el kiffan,Alger,114,F4,False
1,Vente Appartement F2 Alger Cheraga,2025-07-22 15:55:36+00:00,,BINAM IMMOBILIER,Cheraga,Alger,53,F2,False
2,Location Appartement F3 Annaba Annaba,2025-07-22 15:53:18+00:00,50000.0,Agence Immobilière Bône Adresse Annaba,Annaba,Annaba,,F3,True
3,Vente Appartement F4 Alger Mohammadia,2025-07-22 15:53:18+00:00,32500.0,Sultan Real Estate,Mohammadia,Alger,112,F4,False
4,Vente Appartement F3 Alger Bordj el bahri,2025-07-22 15:53:10+00:00,20000000.0,Sultan Real Estate,Bordj el bahri,Alger,90,F3,False
...,...,...,...,...,...,...,...,...,...
10995,Vente Appartement F4 Alger Cheraga,2025-07-22 14:46:02+00:00,,BM Promotion,Cheraga,Alger,380,F4,False
10996,Vente Appartement F4 Alger Cheraga,2025-07-22 14:46:01+00:00,,BM Promotion,Cheraga,Alger,,F4,False
10997,Vente Villa Mostaganem Kheireddine,2025-07-22 12:20:12+00:00,,,Kheireddine,Mostaganem,79,,False
10998,Location vacances Appartement F3 Béjaïa Bejaia,2025-07-22 12:19:50+00:00,4500.0,,Bejaia,Béjaïa,,F3,True


In [23]:
df.to_csv("../../data/processed_data.csv", mode ='a', header='header')

In [24]:
import datetime
print(datetime.datetime.now())

2025-07-25 14:53:54.837412
