In [1]:
import os, sys, mimetypes, io, pandas as pd, numpy, mimetypes
import matplotlib.pyplot as plt
%matplotlib inline

# Notebook display configuration.
from IPython.display import display, HTML
# Inject CSS that forces the `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Do not truncate columns when pandas renders a DataFrame.
pd.set_option("display.max_columns", None)

In [2]:
class FormatClassifier():
    """Nearest-neighbour file-format classifier based on special-character frequencies.

    Text samples are reduced to a feature vector holding the relative frequency
    (in percent) of every character that is NOT in ``discard``, plus a
    ``density`` feature. ``estimate`` returns the file type of the training
    example whose vector is closest (Euclidean distance) to the sample.

    Files whose extension maps to a MIME type other than ``text/plain`` are not
    read at all: the MIME type itself is remembered and mapped to the file type.

    Attributes:
        content:  DataFrame of training vectors, indexed by (filetype, filename).
        mimetype: dict mapping a non-text MIME type to its file type.
        discard:  characters ignored when counting (hyper-parameter).
    """

    # Display names for characters that would be invisible as column labels.
    _GLYPHS = {'\n': 'CR', '\t': 'TAB', ' ': 'SP'}

    def __init__(self, discard=None):
        """Create an untrained classifier.

        Args:
            discard: string of characters to ignore when counting; defaults to
                ASCII letters, digits, '-', '_' and space.
        """
        self.content = pd.DataFrame()
        self.mimetype = {}
        # Hyper parameter(s)
        self.discard = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-_ ' if discard is None else discard

    @staticmethod
    def _non_text_mimetype(filename):
        """Return the MIME type guessed from the file name, or None when it is unknown or text/plain."""
        mimetype, _encoding = mimetypes.guess_type(filename)
        return mimetype if mimetype is not None and mimetype != 'text/plain' else None

    def _preprocess(self, filename=None, string=None):
        """Compute the character-frequency feature vector of a sample.

        Args:
            filename: path of a text file; when given, its first 50000
                characters replace ``string``.
            string: text to analyse (used when ``filename`` is None).

        Returns:
            dict mapping each non-discarded character (with newline, tab and
            space shown as 'CR', 'TAB' and 'SP') to ``round(count/size, 4)*100``,
            plus a 'density' entry; an empty dict when no character was kept.

        Raises:
            AssertionError: when neither ``filename`` nor ``string`` is given.
        """
        if filename is not None:
            # Read-only mode: the file is never written, and 'r+' would fail on
            # read-only files. Undecodable bytes are replaced instead of aborting.
            with open(filename, 'r', errors='replace') as f:
                string = f.read(50000)
        assert string is not None

        # Char count; `wc` counts runs of consecutive discarded characters.
        count = {}
        wc, in_run = 0, False
        for c in string:
            if c in self.discard:
                if not in_run:
                    in_run, wc = True, wc + 1
            else:
                count[c] = count.get(c, 0) + 1
                in_run = False

        # Char frequency
        size = sum(count.values())
        if size == 0:
            return {}
        # NOTE(review): 'density' goes through the same /size*100 scaling as the
        # character counts below, so the stored value is 100/wc (or 10000/size
        # when wc == 0). Kept as-is because existing trained features depend on
        # it -- confirm whether this is intended.
        count['density'] = size / wc if wc != 0 else 100
        return {self._GLYPHS.get(k, k): round(v / size, 4) * 100 for k, v in count.items()}

    # Retrieve frequency of special chars for each file type
    def train(self, directory):
        """Learn from a directory containing one sub-directory per file type.

        Each sub-directory name is used as the file type of the examples it
        contains. Entries of ``directory`` that are not directories are skipped.

        Args:
            directory: path of the examples directory.

        Returns:
            The training DataFrame (also stored in ``self.content``), one row
            per text example, indexed by (filetype, filename).

        Raises:
            AssertionError: when one MIME type is found under two file types.
        """
        all_examples = []
        for filetype in os.listdir(directory):
            type_dir = os.path.join(directory, filetype)
            if not os.path.isdir(type_dir):
                continue
            for example in os.listdir(type_dir):
                filename = os.path.join(type_dir, example)

                # Use mimetype for binary files
                mimetype = self._non_text_mimetype(filename)
                if mimetype is not None:
                    known = self.mimetype.setdefault(mimetype, filetype)
                    assert known == filetype, self.mimetype
                    # Always skip text processing for such files, including the
                    # second and later files of an already registered MIME type.
                    continue

                # Pre-processing for text file
                entry = self._preprocess(filename=filename)
                # Add expected category for supervision purpose
                entry.update({'filetype': filetype, 'filename': filename})
                all_examples.append(entry)

        # Featurize: X = columns, Y = rows['filetype'] -- filename is kept only for debugging
        df = pd.DataFrame(all_examples).fillna(0).set_index(['filetype', 'filename'])

        # Store result
        self.content = df
        return df

    def estimate(self, string=None, filename=None, debug=False):
        """Estimate the file type of a string or of a file.

        Args:
            string: text to classify (used when ``filename`` is None).
            filename: path of the file to classify. When its MIME type was
                registered during training, the mapped file type is returned
                without reading the file; otherwise the content is analysed.
            debug: when True, return the full ranking instead of the best match.

        Returns:
            The file type of the closest training example, or, with
            ``debug=True``, a DataFrame indexed by (filetype, filename) with the
            distance (Euclidean norm + 1) and per-feature differences, sorted
            by increasing distance, zeros displayed as '-'.
        """
        if filename is not None:
            mimetype = self._non_text_mimetype(filename)
            # Only short-circuit on MIME types seen during training; an unknown
            # one falls back to content analysis instead of raising KeyError.
            if mimetype in self.mimetype:
                return self.mimetype[mimetype]

        # Preprocess
        features = self._preprocess(filename=filename, string=string)
        # Pseudo-Featurization: align the sample on the training columns
        columns = self.content.columns.values
        x = pd.Series([features.get(k, 0) for k in columns], index=columns)

        # Estimate combinations
        rows = []
        for (filetype, example), row in self.content.iterrows():
            delta = x - row
            rows.append((filetype, example, numpy.linalg.norm(delta) + 1, *delta.values))
        df = pd.DataFrame(rows)
        # Rank estimation results (column 2 holds the distance)
        df = df.sort_values(by=[2], ascending=True)

        # Return estimation(s)
        if debug:
            df.columns = ['filetype', 'filename', 'distance'] + list(columns)
            return df.replace(0, '-').set_index(['filetype', 'filename'])
        return df.iloc[0, 0]


In [5]:
# Build a classifier with the default discard set and train it on the examples
# directory (train() treats each sub-directory name as a file type).
# NOTE(review): absolute local Windows path -- this cell only runs on the author's machine.
model=  FormatClassifier(discard=None)
df=model.train( r'C:\_data\Lubies\Python\alita\examples')
# NOTE(review): in the call below the path is passed positionally, so it binds to
# `string` and the path text itself would be classified; use filename=... to read the file.
#model.estimate( r'C:\_data\tmp\toto.txt', debug=True)
# Rank every training example against an inline sample; debug=True returns the
# full distance table instead of only the best file type.
debug = model.estimate( string='http:toto:elel\nelke:llklk:', debug=True)
#model.estimate( filename=r'C:\_data\Lubies\Python\alita\examples\bin_excel\excel_blank.xlsx')
# Show the 20 closest training examples.
debug.head(20)
#model.estimate(  r'C:\_data\tmp\empty.txt'  )
# NOTE(review): `_encode` is not defined in the FormatClassifier class visible in
# this notebook; the line below looks like a leftover from an earlier version.
# df, x_encode, x_decode, y_encode, y_decode = model._encode(['xml','ini'])

Unnamed: 0_level_0,Unnamed: 1_level_0,distance,"""",#,$,&,',(,),*,+,",",.,/,:,;,<,=,>,?,@,CR,TAB,[,\,],density,{,|,}
filetype,filename,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
csv_column,C:\_data\Lubies\Python\alita\examples\csv_column\csv.txt,18.14,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,-,17.14,-,-,-
address_ip,C:\_data\Lubies\Python\alita\examples\address_ip\ipv6.txt,30.26175,-,-,-,-,-,-,-,-,-,-,-,-,-20,-,-,-,-,-,-,20,-,-,-,-,7.5,-,-,-
text_sql,C:\_data\Lubies\Python\alita\examples\text_sql\sql_0003.txt,67.635868,-6.45,-,-,-,-19.35,-12.9,-12.9,-,-6.45,-6.45,-,-,57.42,-,-,-,-,-,-,7.1,-,-,-,-,16.15,-,-,-
text_infa,C:\_data\Lubies\Python\alita\examples\text_infa\005.param,72.943349,-,-,-18.37,-,-,-,-,-,-,-2.04,-15.31,-4.08,62.65,-,-,-15.31,-,-,-2.04,4.69,-,-5.1,-,-5.1,18.59,-,-,-
text_infa,C:\_data\Lubies\Python\alita\examples\text_infa\004.param,73.150132,-,-,-17.48,-,-,-,-,-,-,-1.94,-14.56,-5.83,63.5,-,-,-14.56,-,-,-1.94,2.52,-,-4.85,-,-4.85,18.68,-,-,-
text_json,C:\_data\Lubies\Python\alita\examples\text_json\0001.txt,80.763658,-29.41,-,-,-,-,-,-,-,-,-5.88,-,-,68.24,-,-,-,-,-,-,2.35,-23.53,-,-,-,13.33,-5.88,-,-5.88
address_url,C:\_data\Lubies\Python\alita\examples\address_url\localhost.txt,81.779437,-,-,-,-,-,-,-,-,-,-,-14.29,-57.14,51.43,-,-,-,-,-,-,20,-,-,-,-,3.33,-,-,-
text_json,C:\_data\Lubies\Python\alita\examples\text_json\0004.txt,85.031311,-44.19,-,-,-,-,-3.49,-3.49,-,-,-8.14,-,-,67.21,-,-,-,-,-,-,8.37,-,-1.16,-,-1.16,18.18,-6.98,-,-6.98
text_json,C:\_data\Lubies\Python\alita\examples\text_json\0002.txt,86.464245,-44.44,-,-,-,-,-,-,-,-,-8.33,-5.56,-,68.89,-,-,-,-,-,-5.56,11.67,-,-2.78,-,-2.78,15.45,-5.56,-,-5.56
text_json,C:\_data\Lubies\Python\alita\examples\text_json\0003.txt,86.496533,-44.83,-,-,-,-,-,-,-,-,-8.62,-5.17,-,67.93,-,-,-,-,-,-5.17,13.1,-,-1.72,-,-1.72,16.88,-6.9,-,-6.9


In [None]:
# Display the training feature matrix that train() stored on the model
# (rows indexed by filetype and filename).
model.content