# Cheat Sheet on files

In [None]:
# Add the notebook menu through the jyquickhelper helper
# (presumably a table of contents built from the section headers -- confirm).
from jyquickhelper import add_notebook_menu
add_notebook_menu()

## change the encoding of a file

In [None]:
# Create a small UTF-8 text file containing accented characters; the
# following cells use it as their sample input.
sample_text = "ée\nàà"
with open("essai.txt", "w", encoding="utf8") as out:
    out.write(sample_text)

In [None]:
from ensae_projects.data import change_encoding

# Run change_encoding from essai.txt to essai.utf8.txt (the next cell reads
# the second file back as UTF-8).
# NOTE(review): enc1 presumably names the encoding of the input file --
# confirm in the ensae_projects documentation.
source_name, target_name = "essai.txt", "essai.utf8.txt"
change_encoding(source_name, target_name, enc1="utf8")

1

In [None]:
# Read the converted file back as UTF-8 and print its content to check
# that the accented characters are intact.
with open("essai.utf8.txt", mode="r", encoding="utf8") as converted:
    s = converted.read()
print(s)

ée
àà


## select a subset of columns from a csv file

In [None]:
import pyensae

# Fetch the archive from the UCI repository; download_data returns a list
# of file names, and the second entry is the CSV used below.
files = pyensae.download_data(
    "OnlineNewsPopularity.zip",
    website="http://archive.ics.uci.edu/ml/machine-learning-databases/00332/",
)
files[1]

'OnlineNewsPopularity/OnlineNewsPopularity.csv'

In [None]:
# Preview the first two lines of the CSV with pyensae's %head magic.
%head OnlineNewsPopularity/OnlineNewsPopularity.csv -n 2

In [None]:
from ensae_projects.data import enumerate_text_lines


def clean_column_name(s):
    """Return the column name ``s`` without leading or trailing whitespace."""
    return s.strip()


# Options for reading the comma-separated file with a header row; every
# column name goes through clean_column_name.
csv_path = "OnlineNewsPopularity/OnlineNewsPopularity.csv"
reader_options = dict(
    encoding="utf-8",
    header=True,
    quotes_as_str=False,
    sep=",",
    clean_column_name=clean_column_name,
    fLOG=print,
)
bigfile = enumerate_text_lines(csv_path, **reader_options)

In [None]:
# Keep only the two columns of interest from every row.
# NOTE(review): if bigfile is a one-shot iterator, re-running this cell
# without re-creating bigfile would produce an empty list -- confirm.
wanted_columns = ("LDA_00", "title_sentiment_polarity")
res = [{column: row[column] for column in wanted_columns} for row in bigfile]
len(res)

39644

In [None]:
import pandas

# Turn the list of row dictionaries into a dataframe and show the first rows.
df = pandas.DataFrame(data=res)
df.head(n=5)

Unnamed: 0,LDA_00,title_sentiment_polarity
0,0.500331204081,-0.1875
1,0.799755687423,0.0
2,0.217792288518,0.0
3,0.0285732164707,0.0
4,0.0286328101715,0.136363636364


## look at the head or tail of a file

We use magic commands [%head](http://www.xavierdupre.fr/app/pyensae/helpsphinx/all_NB.html?highlight=head#head) and [%tail](http://www.xavierdupre.fr/app/pyensae/helpsphinx/all_NB.html?highlight=head#tail).

In [None]:
import pyensae

In [None]:
# Show the first line of essai.txt.
%head essai.txt -n 1

In [None]:
# Show the last line of essai.txt.
%tail essai.txt -n 1

## select lines of a flat file based on a regular expression

In [None]:
import pyensae

In [None]:
# Select the lines of essai.txt matching the regular expression .*é.*
# (any line containing the character é).
%grep essai.txt .*é.*

A more complex example: we extract all the lines containing a substring and prepend the header line, so that the resulting file can be loaded as a dataframe.
We usually do that when a big file does not fit into memory and cannot be loaded with [pandas](http://pandas.pydata.org/). This code relies on the magic command [%grep](http://www.xavierdupre.fr/app/pyensae/helpsphinx/all_NB.html#grep) and the function [enumerate_grep](http://www.xavierdupre.fr/app/pyensae/helpsphinx/pyensae/file_helper/content_helper.html?highlight=enumerate_grep#pyensae.file_helper.content_helper.enumerate_grep).

In [None]:
import pandas
import pyensae
df = pandas.DataFrame([dict(name="Dupré", first_name="Xavier"),
                       dict(name="Dupré", first_name="Sloane")])
df.to_csv("data.txt", encoding="utf8", index=False)
%head data.txt

In [None]:
# Capture the line(s) of data.txt containing "Xavier"; with --raw the result
# is assigned to `raw` as text.
raw = %grep data.txt Xavier --raw

In [None]:
# Display the captured text.
raw

'Xavier,Dupré\n'

In [None]:
# Capture the first line of data.txt (the CSV header) as text, then display it.
header = %head data.txt -n 1 --raw
header

'first_name,name\n'

In [None]:
with open("data_xavier.txt", "w", encoding="utf8") as f:
    f.write(header)
    f.write(raw)
    
%head data_xavier.txt

In [None]:
# Load the filtered file as a dataframe; the first line written above
# provides the column names.
pandas.read_csv("data_xavier.txt")

Unnamed: 0,first_name,name
0,Xavier,Dupré
