In [None]:
import csv

input_file = 'G_D.csv'
output_file = 'G_D_clean.csv'

expected_columns = 13  # Field count of a complete record; must match the header row


def _parse_buffer(lines):
    """Parse the buffered physical lines as ONE CSV record.

    The lines are joined with a single space (not a newline) so that the words
    on either side of a stray line break are not merged together.

    Returns the list of fields, or None when the csv module rejects the text.
    """
    try:
        return next(csv.reader([' '.join(lines)]))
    except csv.Error:
        return None


def _is_beyond_repair(row, expected):
    """Return True when buffering more lines can never yield a valid record.

    Appending text can only add fields, never remove them, so a row that
    already has too many fields (or that csv refuses to parse) is hopeless.
    """
    return row is None or len(row) > expected


# Rebuild records that were split over several physical lines: keep joining
# lines until the accumulated text parses to exactly `expected_columns` fields.
with open(input_file, 'r', encoding='utf-8') as infile, \
        open(output_file, 'w', encoding='utf-8', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile, quoting=csv.QUOTE_MINIMAL)

    # The csv reader consumes only the physical line(s) of the header; the raw
    # loop below carries on from the following line of the same file object.
    header = next(reader, None)
    if header is None:
        print("Warning: input file is empty, nothing to clean:", input_file)
    else:
        writer.writerow(header)

    buffer = []
    dropped_fragments = 0
    for line in infile:
        buffer.append(line.strip('\n'))
        row = _parse_buffer(buffer)

        if _is_beyond_repair(row, expected_columns) and len(buffer) > 1:
            # The previously buffered lines were a broken fragment. Without
            # this reset the buffer would grow forever and swallow every
            # remaining record, so resynchronise on the current line alone.
            dropped_fragments += 1
            buffer = buffer[-1:]
            row = _parse_buffer(buffer)

        if _is_beyond_repair(row, expected_columns):
            # Even on its own this line cannot be a valid record: skip it.
            dropped_fragments += 1
            buffer = []
        elif len(row) == expected_columns:
            writer.writerow(row)
            buffer = []  # Record complete: start collecting the next one
        # Otherwise the record is still incomplete: keep buffering.

    if buffer:
        print("Warning: leftover data that didn't parse cleanly:", buffer)
    if dropped_fragments:
        print(f"Warning: dropped {dropped_fragments} malformed fragment(s) "
              f"that could not form a {expected_columns}-column record")


In [2]:
import pandas as pd

# Load the repaired CSV written by the cleaning step into a DataFrame.
df = pd.read_csv(filepath_or_buffer='G_D_clean.csv')

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ChefSam,Sunshine State,Culinarian | Hot Sauce Artisan | Kombucha Brew...,2011-03-23 03:50:13,4680,2643,6232,False,2023-03-01 23:59:59,Which #bitcoin books should I think about read...,['bitcoin'],Twitter for iPhone,False
1,Roy⚡️,,Truth-seeking pleb 📚 • Science 🧪 • Nature 🌱☀️ ...,2022-01-30 17:41:41,770,1145,9166,False,2023-03-01 23:59:47,"@ThankGodForBTC I appreciate the message, but ...",['Bitcoin'],Twitter for iPhone,False
2,Ethereum Yoda,,UP or DOWN... . . . . Price matters NOT.,2022-07-24 04:50:18,576,1,0,False,2023-03-01 23:59:42,#Ethereum price update: #ETH $1664.02 USD #B...,"['Ethereum', 'ETH', 'Bitcoin', 'BTC', 'altcoin...",Twitter Web App,False
3,Viction,"Paris, France",https://t.co/8M3rgdjwEe #bitcoin #blockchain ...,2010-03-26 10:15:26,236,1829,2195,False,2023-03-01 23:59:36,CoinDashboard v3.0 is here Available on ios an...,['Bitcoin'],Twitter for Android,False
4,Rosie,London,"The flower language of jasmine is loyalty, res...",2013-02-16 09:57:56,12731,46,134,False,2023-03-01 23:59:32,#Bitcoin Short Term Fractal (4H)💥 In lower ti...,"['Bitcoin', 'BTC']",Twitter Web App,False


In [31]:
# Print the frame summary: per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169820 entries, 0 to 169819
Data columns (total 13 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   user_name         169818 non-null  object
 1   user_location     84162 non-null   object
 2   user_description  158756 non-null  object
 3   user_created      169820 non-null  object
 4   user_followers    169820 non-null  int64 
 5   user_friends      169820 non-null  int64 
 6   user_favourites   169820 non-null  int64 
 7   user_verified     169820 non-null  bool  
 8   date              169820 non-null  object
 9   text              169820 non-null  object
 10  hashtags          169820 non-null  object
 11  source            169820 non-null  object
 12  is_retweet        169820 non-null  bool  
dtypes: bool(2), int64(3), object(8)
memory usage: 14.6+ MB


In [32]:
# Count the missing values in each column.
df.isna().sum()

user_name               2
user_location       85658
user_description    11064
user_created            0
user_followers          0
user_friends            0
user_favourites         0
user_verified           0
date                    0
text                    0
hashtags                0
source                  0
is_retweet              0
dtype: int64

In [33]:
# Preview the free-text columns before they are cleaned.
df.loc[:, ['text', 'user_name', 'user_description']].head()

Unnamed: 0,text,user_name,user_description
0,Which #bitcoin books should I think about read...,ChefSam,Culinarian | Hot Sauce Artisan | Kombucha Brew...
1,"@ThankGodForBTC I appreciate the message, but ...",Roy⚡️,Truth-seeking pleb 📚 • Science 🧪 • Nature 🌱☀️ ...
2,#Ethereum price update: #ETH $1664.02 USD #B...,Ethereum Yoda,UP or DOWN... . . . . Price matters NOT.
3,CoinDashboard v3.0 is here Available on ios an...,Viction,https://t.co/8M3rgdjwEe #bitcoin #blockchain ...
4,#Bitcoin Short Term Fractal (4H)💥 In lower ti...,Rosie,"The flower language of jasmine is loyalty, res..."


In [None]:
import re

# Text-cleaning helper applied to the free-text columns.
def remove_emojis(text):
    """Strip every character that is neither a word character nor whitespace.

    Non-string values (for example NaN or None) are returned unchanged.

    NOTE(review): despite the name, this removes far more than emojis. All
    punctuation and symbols are dropped as well, so '$1664.02' becomes
    '166402' and '#', '@', ':', '/' and '.' disappear from hashtags,
    mentions and URLs. Letters, digits and underscores are kept.
    """
    if not isinstance(text, str):
        return text
    cleaned = re.sub(r'[^\w\s]', '', text)
    return cleaned

# Clean several columns at once, one column (Series) at a time.
cols_to_clean = ['text', 'user_name', 'user_description']
df[cols_to_clean] = df[cols_to_clean].apply(
    lambda column: column.map(remove_emojis)
)

# Show the first 5 rows of the cleaned columns.
df[cols_to_clean].head()

Unnamed: 0,text,user_name,user_description
0,Which bitcoin books should I think about readi...,ChefSam,Culinarian Hot Sauce Artisan Kombucha Brewer...
1,ThankGodForBTC I appreciate the message but no...,Roy,Truthseeking pleb Science Nature stacke...
2,Ethereum price update ETH 166402 USD Bitcoin...,Ethereum Yoda,UP or DOWN Price matters NOT
3,CoinDashboard v30 is here Available on ios and...,Viction,httpstco8M3rgdjwEe bitcoin blockchain crypto ...
4,Bitcoin Short Term Fractal 4H In lower timefr...,Rosie,The flower language of jasmine is loyalty resp...
